# Heart Disease Dataset


# This data set dates from 1988 and consists of four databases: Cleveland, Hungary, 
# Switzerland, and Long Beach V. It contains 76 attributes, including the predicted 
# attribute, but all published experiments refer to using a subset of 14 of them. 
# The "target" field refers to the presence of heart disease in the patient. 
# It is integer valued 0 = no disease and 1 = disease.

# This is multivariate type of dataset which means providing or involving a variety of 
# separate mathematical or statistical variables, multivariate numerical data analysis. 
# It is composed of 14 attributes.
# 
# One of the major tasks on this dataset is to predict based on the given attributes of a 
# patient that whether that particular person has a heart disease or not and other is the 
# experimental task to diagnose and find out various insights from this dataset which could 
# help in understanding the problem more.


# There are various factors associated in the process of determining whether a person will 
# have a heart disease or not. In this project, we will do the analysis on some hypothesis and
# will come up with some conclusions for the same.

# Dataset columns:

# age: The person’s age in years

# sex: The person’s sex (1 = male, 0 = female)

# cp: chest pain type
# — Value 0: asymptomatic
# — Value 1: atypical angina
# — Value 2: non-anginal pain
# — Value 3: typical angina

# trestbps: The person’s resting blood pressure (mm Hg on admission to the hospital)

# chol: The person’s cholesterol measurement in mg/dl

# fbs: The person’s fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)

# restecg: resting electrocardiographic results
#   Value 0: showing probable or definite left ventricular hypertrophy by Estes’ criteria
#   Value 1: normal
#   Value 2: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)

# thalach: The person’s maximum heart rate achieved

# exang: Exercise induced angina (1 = yes; 0 = no)

# oldpeak: ST depression induced by exercise relative to rest (‘ST’ relates to positions on the ECG plot. See more here)

# slope: the slope of the peak exercise ST segment — 0: downsloping; 1: flat; 2: upsloping
# 0: downsloping; 1: flat; 2: upsloping

# ca: The number of major vessels (0–3)

# thal: A blood disorder called thalassemia Value 0: NULL (dropped from the dataset previously
#   Value 1: fixed defect (no blood flow in some part of the heart)
#   Value 2: normal blood flow
#   Value 3: reversible defect (a blood flow is observed but it is not normal)

# target: Heart disease (1 = no, 0= yes)


library(magrittr) 
library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(forcats)
library(rsample)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.2.1     ✔ purrr   1.0.1
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.1     ✔ recipes      1.0.5
## ✔ dials        1.2.0     ✔ tune         1.1.1
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.1.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.0     ✔ yardstick    1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard()  masks purrr::discard()
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ recipes::fixed()   masks stringr::fixed()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ yardstick::spec()  masks readr::spec()
## ✖ recipes::step()    masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(readr)
library(corrplot)
## corrplot 0.92 loaded
heart <- read_csv("/Users/sarju/Desktop/MITA Sem 2/MVA/Individual_Project/heart.csv")
## Rows: 1025 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
heart
## # A tibble: 1,025 × 14
##      age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##    <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl>
##  1    52     1     0      125   212     0       1     168     0     1       2
##  2    53     1     0      140   203     1       0     155     1     3.1     0
##  3    70     1     0      145   174     0       1     125     1     2.6     0
##  4    61     1     0      148   203     0       1     161     0     0       2
##  5    62     0     0      138   294     1       1     106     0     1.9     1
##  6    58     0     0      100   248     0       0     122     0     1       1
##  7    58     1     0      114   318     0       2     140     0     4.4     0
##  8    55     1     0      160   289     0       0     145     1     0.8     1
##  9    46     1     0      120   249     0       0     144     0     0.8     2
## 10    54     1     0      122   286     0       0     116     1     3.2     1
## # … with 1,015 more rows, and 3 more variables: ca <dbl>, thal <dbl>,
## #   target <dbl>
dim(heart) # 1025 rows & 14 columns
## [1] 1025   14
colnames(heart)
##  [1] "age"      "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"     "target"
str(heart)
## spc_tbl_ [1,025 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age     : num [1:1025] 52 53 70 61 62 58 58 55 46 54 ...
##  $ sex     : num [1:1025] 1 1 1 1 0 0 1 1 1 1 ...
##  $ cp      : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
##  $ trestbps: num [1:1025] 125 140 145 148 138 100 114 160 120 122 ...
##  $ chol    : num [1:1025] 212 203 174 203 294 248 318 289 249 286 ...
##  $ fbs     : num [1:1025] 0 1 0 0 1 0 0 0 0 0 ...
##  $ restecg : num [1:1025] 1 0 1 1 1 0 2 0 0 0 ...
##  $ thalach : num [1:1025] 168 155 125 161 106 122 140 145 144 116 ...
##  $ exang   : num [1:1025] 0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num [1:1025] 1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
##  $ slope   : num [1:1025] 2 0 0 2 1 1 0 1 2 1 ...
##  $ ca      : num [1:1025] 2 0 0 1 3 0 3 1 0 2 ...
##  $ thal    : num [1:1025] 3 3 3 3 2 2 1 3 3 2 ...
##  $ target  : num [1:1025] 0 0 0 0 0 1 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   sex = col_double(),
##   ..   cp = col_double(),
##   ..   trestbps = col_double(),
##   ..   chol = col_double(),
##   ..   fbs = col_double(),
##   ..   restecg = col_double(),
##   ..   thalach = col_double(),
##   ..   exang = col_double(),
##   ..   oldpeak = col_double(),
##   ..   slope = col_double(),
##   ..   ca = col_double(),
##   ..   thal = col_double(),
##   ..   target = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(heart)
##       age             sex               cp            trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :0.0000   Min.   : 94.0  
##  1st Qu.:48.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:120.0  
##  Median :56.00   Median :1.0000   Median :1.0000   Median :130.0  
##  Mean   :54.43   Mean   :0.6956   Mean   :0.9424   Mean   :131.6  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:2.0000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :3.0000   Max.   :200.0  
##       chol          fbs            restecg          thalach     
##  Min.   :126   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:132.0  
##  Median :240   Median :0.0000   Median :1.0000   Median :152.0  
##  Mean   :246   Mean   :0.1493   Mean   :0.5298   Mean   :149.1  
##  3rd Qu.:275   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:166.0  
##  Max.   :564   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##      exang           oldpeak          slope             ca        
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.800   Median :1.000   Median :0.0000  
##  Mean   :0.3366   Mean   :1.072   Mean   :1.385   Mean   :0.7541  
##  3rd Qu.:1.0000   3rd Qu.:1.800   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.200   Max.   :2.000   Max.   :4.0000  
##       thal           target      
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:2.000   1st Qu.:0.0000  
##  Median :2.000   Median :1.0000  
##  Mean   :2.324   Mean   :0.5132  
##  3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :3.000   Max.   :1.0000
# str function says that target column is a num type... so will make it a factor of 2 groups
# heart$target <- as.factor(heart$target)

# also converting cp column into factor of 4 groups
# heart$cp <- as.factor(heart$cp)
# 
# heart$sex <- as.character(heart$sex)
str(heart)
## spc_tbl_ [1,025 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age     : num [1:1025] 52 53 70 61 62 58 58 55 46 54 ...
##  $ sex     : num [1:1025] 1 1 1 1 0 0 1 1 1 1 ...
##  $ cp      : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
##  $ trestbps: num [1:1025] 125 140 145 148 138 100 114 160 120 122 ...
##  $ chol    : num [1:1025] 212 203 174 203 294 248 318 289 249 286 ...
##  $ fbs     : num [1:1025] 0 1 0 0 1 0 0 0 0 0 ...
##  $ restecg : num [1:1025] 1 0 1 1 1 0 2 0 0 0 ...
##  $ thalach : num [1:1025] 168 155 125 161 106 122 140 145 144 116 ...
##  $ exang   : num [1:1025] 0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num [1:1025] 1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
##  $ slope   : num [1:1025] 2 0 0 2 1 1 0 1 2 1 ...
##  $ ca      : num [1:1025] 2 0 0 1 3 0 3 1 0 2 ...
##  $ thal    : num [1:1025] 3 3 3 3 2 2 1 3 3 2 ...
##  $ target  : num [1:1025] 0 0 0 0 0 1 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   sex = col_double(),
##   ..   cp = col_double(),
##   ..   trestbps = col_double(),
##   ..   chol = col_double(),
##   ..   fbs = col_double(),
##   ..   restecg = col_double(),
##   ..   thalach = col_double(),
##   ..   exang = col_double(),
##   ..   oldpeak = col_double(),
##   ..   slope = col_double(),
##   ..   ca = col_double(),
##   ..   thal = col_double(),
##   ..   target = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# 
# heart[heart$sex==1, 'sex'] <- "male"
# heart[heart$sex==0, 'sex'] <- "female"
# heart$sex


heart_x = subset(heart, select = c(age,cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal))
heart_x   # heart_x contains all the columns except the target column...
## # A tibble: 1,025 × 12
##      age    cp trestbps  chol   fbs restecg thalach exang oldpeak slope    ca
##    <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl> <dbl>
##  1    52     0      125   212     0       1     168     0     1       2     2
##  2    53     0      140   203     1       0     155     1     3.1     0     0
##  3    70     0      145   174     0       1     125     1     2.6     0     0
##  4    61     0      148   203     0       1     161     0     0       2     1
##  5    62     0      138   294     1       1     106     0     1.9     1     3
##  6    58     0      100   248     0       0     122     0     1       1     0
##  7    58     0      114   318     0       2     140     0     4.4     0     3
##  8    55     0      160   289     0       0     145     1     0.8     1     1
##  9    46     0      120   249     0       0     144     0     0.8     2     0
## 10    54     0      122   286     0       0     116     1     3.2     1     2
## # … with 1,015 more rows, and 1 more variable: thal <dbl>
heart_y = heart$target
heart_y  
##    [1] 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 1 0 1
##   [38] 1 1 0 1 1 0 0 1 1 1 0 1 0 1 0 1 0 0 0 1 1 0 0 1 1 0 1 1 0 1 0 1 0 0 0 0 0
##   [75] 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 1 1 0 0 1 1 1 0 0 1 1 1 1 1 1 0 0 0 0 0 0
##  [112] 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0 1 1 0 1 0
##  [149] 1 1 0 0 0 1 0 1 1 1 1 1 0 1 0 0 0 0 0 1 1 1 1 0 1 1 0 0 0 0 0 0 0 1 0 1 1
##  [186] 0 0 0 0 0 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0 0 1 1 1 0 1 0 0 0 0
##  [223] 1 1 1 1 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 1 1 0 0 1 1 0 1 0 0 0 1 1 1 0
##  [260] 1 1 1 1 1 0 1 0 0 0 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 0 0
##  [297] 0 0 1 1 1 1 1 0 1 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 1 1 0 0 1 1 0 0 0 1 1 0 1
##  [334] 1 0 0 1 1 0 0 1 1 1 1 1 0 0 1 0 0 1 0 0 1 0 1 0 0 0 1 1 1 1 1 0 1 0 0 1 1
##  [371] 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 0 0 1
##  [408] 0 1 0 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 0 0 0 0 1 1 1 1 0 0 1 0 0 0 0 1
##  [445] 1 1 1 0 1 0 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 0 0 1 0 0
##  [482] 0 0 1 0 0 0 0 1 1 1 1 0 0 1 1 0 0 1 1 1 1 1 1 0 1 0 1 0 0 1 0 0 0 1 0 0 1
##  [519] 0 0 0 1 1 0 0 1 0 1 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 0 1 0 0 1 1 1 0 0 0 1 0
##  [556] 0 0 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 0 0 1 1 0 1 0 1 1 1 0 0 1 0 0 0 0 1 0
##  [593] 0 0 0 0 1 1 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 1
##  [630] 0 0 1 1 0 0 1 0 1 0 1 1 0 1 1 1 0 0 1 1 1 0 1 1 0 1 1 0 1 0 1 0 0 1 1 1 1
##  [667] 1 0 1 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 1 0 1
##  [704] 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0 1 1 0 1 1 1 1 0 0 1 0 1 1 0 1 1 0 1 0 0 0 0
##  [741] 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 0 0 0 0 1 1 1 1 0 0 1 1 0
##  [778] 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 0 1 1 0 1 1 1 0 0 0 0
##  [815] 1 1 1 1 1 0 0 0 0 1 0 1 1 0 1 0 1 1 1 0 0 0 1 1 1 1 0 1 0 0 0 0 1 0 1 0 0
##  [852] 1 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1
##  [889] 0 0 0 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0
##  [926] 0 0 1 0 0 0 0 1 0 1 1 1 0 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 1 1 1 1 0 1 1 1 1
##  [963] 1 0 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 0 1 0 1 1 1 1 0 0 0 1 1 0 1 0 0 1 0 0 0
## [1000] 0 0 1 0 1 1 0 1 1 1 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0
heart_cm <- colMeans(heart_x)
heart_cm
##         age          cp    trestbps        chol         fbs     restecg 
##  54.4341463   0.9424390 131.6117073 246.0000000   0.1492683   0.5297561 
##     thalach       exang     oldpeak       slope          ca        thal 
## 149.1141463   0.3365854   1.0715122   1.3853659   0.7541463   2.3239024
heart_S <- cov(heart_x)
heart_S
##                  age          cp    trestbps         chol          fbs
## age       82.3064501 -0.67225133  43.0857327  102.8906250  0.392163681
## cp        -0.6722513  1.06016006   0.6885652   -4.3369141  0.029108232
## trestbps  43.0857327  0.68856517 306.8354097  115.6572266  1.135164825
## chol     102.8906250 -4.33691406 115.6572266 2661.7871094  0.495117188
## fbs        0.3921637  0.02910823   1.1351648    0.4951172  0.127111280
## restecg   -0.6354897  0.02368712  -1.1446846   -4.0146484 -0.019582698
## thalach  -81.4460890  7.26829554 -15.8228220  -25.8417969 -0.072719131
## exang      0.3781441 -0.19545065   0.5067978    1.6435547  0.008303163
## oldpeak    2.2188253 -0.21140701   3.8579706    3.9333008  0.004549447
## slope     -0.9477420  0.08372713  -1.3033441   -0.4541016 -0.013633765
## ca         2.5394579 -0.18701696   1.8878420    3.9492188  0.050405869
## thal       0.4070932 -0.10438453   0.6444465    3.2099609 -0.009333079
##               restecg      thalach        exang      oldpeak       slope
## age      -0.635489710 -81.44608899  0.378144055  2.218825267 -0.94774200
## cp        0.023687119   7.26829554 -0.195450648 -0.211407012  0.08372713
## trestbps -1.144684642 -15.82282203  0.506797828  3.857970560 -1.30334413
## chol     -4.014648438 -25.84179688  1.643554688  3.933300781 -0.45410156
## fbs      -0.019582698  -0.07271913  0.008303163  0.004549447 -0.01363377
## restecg   0.278654726   0.58790873 -0.016372904 -0.031085080  0.02807260
## thalach   0.587908727 529.26332508 -4.136113758 -9.456022389  5.61807832
## exang    -0.016372904  -4.13611376  0.223513720  0.172683880 -0.07807736
## oldpeak  -0.031085080  -9.45602239  0.172683880  1.380750152 -0.41752668
## slope     0.028072599   5.61807832 -0.078077363 -0.417526677  0.38162157
## ca       -0.042481898  -4.92991711  0.052558117  0.268672923 -0.04676543
## thal     -0.006717797  -1.40028963  0.057864901  0.147810499 -0.03607565
##                   ca         thal
## age       2.53945789  0.407093178
## cp       -0.18701696 -0.104384527
## trestbps  1.88784204  0.644446456
## chol      3.94921875  3.209960938
## fbs       0.05040587 -0.009333079
## restecg  -0.04248190 -0.006717797
## thalach  -4.92991711 -1.400289634
## exang     0.05255812  0.057864901
## oldpeak   0.26867292  0.147810499
## slope    -0.04676543 -0.036075648
## ca        1.06254383  0.095335366
## thal      0.09533537  0.385219131
d <- apply(heart_x, MARGIN = 1, function(heart_x)t(heart_x - heart_cm) %*% solve(heart_S) %*% (heart_x - heart_cm)) # taking the distance... (distance formula)
d
##    [1]  7.567230 18.324510 15.885833  9.366377 18.787735 10.011945 38.960972
##    [8]  8.675187  8.844229  9.931920 15.991847 21.134232  7.715376 15.765682
##   [15] 27.869344  7.715376  9.132464  6.522142  3.684812 10.241469 10.425369
##   [22] 10.594284 12.479870  6.171794  6.495000  7.624908 10.202440 11.374880
##   [29] 16.229769 29.556340 18.145348  3.684812 10.851811 13.648647  3.416521
##   [36] 12.832129 18.384857  8.411755 10.523994  6.859389 12.145491  7.289091
##   [43]  8.902688  8.844229  7.967498  7.196298 12.799737 21.806891  8.163048
##   [50] 16.860813 13.259467  9.071561 24.410827 13.200681 19.721716 19.721716
##   [57] 11.708776  6.364545 10.263652  5.696189 12.567033  8.163048 13.117883
##   [64] 13.513219 12.567033 15.464898 16.158613 17.385257  5.334974 34.149741
##   [71] 14.786531  9.576794  9.954783 11.934555 14.457960  5.261782 11.923134
##   [78] 21.414340  3.303600  3.303600  6.613851 18.350602 12.832129 24.410827
##   [85]  5.334974  3.083185 10.241469 16.346673 15.758809 19.593698  6.000657
##   [92]  7.796978 21.414340 18.145348 12.685021  8.520165  5.767077 10.739928
##   [99] 14.890842 20.806033  5.423973 18.197598  7.796058  5.261782 16.220477
##  [106]  8.804543 10.528573 10.378807 11.689726 11.227858  8.276915 13.390193
##  [113] 14.457960 11.255072 15.266901  7.624908  6.123407 11.556829 12.567033
##  [120]  5.603233  7.796058 18.145348  6.171322 19.868914  9.499031  8.478883
##  [127]  9.746636 16.158613 18.245952 11.378978  8.478883  4.837180 12.799737
##  [134]  3.416521  7.796058 20.865359  7.967498 20.093110  5.261782  4.893574
##  [141] 13.562466 10.716731 10.643449 11.855798  6.082881 14.074218  5.732385
##  [148] 11.327498 16.634908  9.987618 38.960972 21.131070  8.903341  7.289091
##  [155] 17.164473  7.796058 17.905638  7.289091 47.534084  6.194887 22.692537
##  [162] 10.500501 22.692537  8.300828  9.954783 14.786531 13.875012 12.737300
##  [169] 10.923991  7.741729 10.083886 18.312262 17.571000  7.766130  6.933679
##  [176] 25.882587 18.312262 10.742021  8.276915 14.399568 21.414340 17.978966
##  [183]  9.181265  6.126505  5.783882 19.540936 14.074218  9.954783  6.123407
##  [190] 11.399358  4.916928  5.999869 47.534084 15.882222 12.865842 13.513219
##  [197]  8.870408  7.677090  9.753318  8.730911  7.796978 11.855798 16.634908
##  [204] 15.158247  8.163048 13.711653 15.266901  4.916928 24.410827 10.378807
##  [211] 26.925677 13.117883 11.568543 10.923991  5.783882  3.253317  8.294272
##  [218]  6.194887 11.189485  9.931920  7.210865  6.123407 17.978966  7.766130
##  [225] 11.448772  8.746441 18.350602  6.681593 11.661816 21.806891  8.903341
##  [232]  5.084182  6.075646  6.634512  5.033111 15.158247 18.350602  5.705230
##  [239] 16.860813  8.838920 11.982901 10.263652 24.410827 10.425369 12.954323
##  [246]  5.627663 21.131070 11.189485  8.717593  6.126505  8.804543  7.741729
##  [253]  9.208650 19.540936 10.754012 13.668936 12.214204 20.093110 24.560255
##  [260]  9.746636  3.416345  8.746441  7.631827  7.967498 20.865359 12.479870
##  [267] 10.739928 20.392370 16.297574 17.196970  7.543155  5.322923  8.717593
##  [274] 11.227858 20.685614 18.312262 10.683295  5.627663  8.675187  5.448627
##  [281]  5.467871  5.998285  7.707600 17.978966 16.297574 17.196970 10.715211
##  [288] 11.716112  8.433534 11.689726 18.245952  6.213733 13.318870 13.513219
##  [295] 25.882587 13.117883 20.392370 12.452186  5.998285  6.064092  4.453693
##  [302] 14.523380  7.967498  6.171322  6.646520  8.439129  6.681593  3.083185
##  [309] 15.266901  4.240356  9.499031 14.457960 13.648647 13.858305 17.905638
##  [316] 13.711653 10.715211  6.171794 11.934555 23.360333  5.951457  6.432516
##  [323] 19.016641 25.353496  6.364545  4.240356 21.131070 14.328320 13.736614
##  [330] 23.360333  5.334974 15.429358 19.754734  5.504205  6.541213 12.452186
##  [337] 18.197598 14.207833 16.229769  6.763125 24.410827  3.416345  7.837793
##  [344] 18.016146  6.194887 25.353496 11.568543  6.000657 26.631753 13.137499
##  [351]  9.746636  8.730911 11.255072 12.205877  5.705230  8.939781 28.839256
##  [358] 12.865842  9.622106 23.360333  7.631827 10.686158  5.423973 14.316413
##  [365]  7.210865 14.316413 10.100743 15.488123 13.747349  9.731331 21.134232
##  [372]  9.208650  5.504205  5.730386 16.984642 20.685614  9.368690  8.442907
##  [379] 20.392370 16.220477 10.806579 16.297574  7.232464 12.452186 12.723347
##  [386]  9.731331  6.495000 22.692537 10.742021 20.441713 11.374880 13.748795
##  [393]  9.731331 34.149741 17.164473  8.163048 18.637960 14.074218 20.685614
##  [400] 20.441713 13.200681 15.991847  9.769501  4.453693  9.576794 16.229769
##  [407] 10.241469  8.870408  8.939781 12.832129  5.448627  9.954783 14.328320
##  [414] 13.736614 15.464898  7.707600 14.207833 18.245952  7.796978 12.145491
##  [421]  5.767077 12.924752 12.737300 11.374880 19.016641  6.780655  7.757639
##  [428]  9.933930 26.631753  4.821205 10.263652 13.485158  9.753318 19.754734
##  [435] 12.441599 12.737300 16.346673  8.371008  3.553975  5.730386  8.838920
##  [442]  9.181265 13.562466  9.933930  3.553975 11.661816 14.316413  8.675187
##  [449]  5.732385  4.844452 22.931051  5.619202 21.806891  5.619202 13.485158
##  [456] 15.882222  4.821205  8.717593 10.083886 11.448772 11.419118 11.923134
##  [463] 16.220477  7.543155 47.534084 24.410827  5.627663  8.804543 13.318870
##  [470] 12.865842  8.478883 10.715211  6.784111  7.416307 15.488123 18.197598
##  [477] 19.540936  6.859389  9.987618  8.861752  6.780655 22.931051 15.765682
##  [484]  5.998285 14.890842 10.378807 11.327498  8.294272 11.982901 13.318870
##  [491]  5.767077  9.933930 11.399358  9.208650 12.954323  8.620682 18.637960
##  [498]  5.696189  6.317307 16.984642 15.991847 12.214204 16.984642  5.467871
##  [505]  4.821205  6.681593  9.499031  7.707600 25.882587 29.556340 11.982901
##  [512] 11.556829  7.978755 11.227858  3.083185 13.200681  6.541213  5.448627
##  [519] 13.200681  9.366377 11.934555 17.885162  8.661545  9.366377 10.100743
##  [526]  4.837180 34.149741  7.796978 22.972734 15.882222  8.498527  7.837793
##  [533] 10.716731  5.809265  6.000657 20.806033  3.969666  6.064092  8.844229
##  [540] 15.158247  6.213733  5.440476 13.668936  9.622106 14.207833 15.488123
##  [547] 13.736614 10.594284  5.732385  9.369839 11.419118  9.931920 21.134232
##  [554] 16.158613  8.870408 19.003499 11.934555  7.631827  7.766130 20.392370
##  [561]  6.784111  7.837793  6.495000  7.978755 13.875012  5.809265  6.261242
##  [568]  7.416307  7.757639 26.925677 11.173832  6.171322 11.855798  7.978755
##  [575]  6.075646 10.083886 16.634908 13.736614 13.668936 10.806579 16.798833
##  [582]  5.440476  3.022686 11.556829  9.208650 17.905638 11.399358 28.839256
##  [589]  7.624908  9.931920 13.858305 15.429358 13.648647 15.429358 12.973251
##  [596]  9.366377  9.368690 24.410827  5.730386 10.089017 13.137499 11.189485
##  [603]  8.433534 10.089017  3.553975 17.196970 11.423574 17.385257 12.973251
##  [610] 29.556340 21.134232 11.780865 20.865359 19.721716  6.780655  3.969666
##  [617] 10.606454  6.634512  3.253317 14.890842 13.562466 14.457960 18.787735
##  [624] 10.643449 22.972734 11.419118 16.297574 24.560255 13.164780 13.369841
##  [631] 13.748795  3.253317  5.467871 12.466745  7.567230  5.951457 16.346673
##  [638]  8.433534 13.369841  6.784111  8.939781 14.399568 10.523994 11.944947
##  [645]  3.416345 11.568543  4.821205  8.442907 15.991847  5.467871  8.439129
##  [652]  6.194887 16.380300 18.312262  6.634512  4.893574  7.210865  8.717593
##  [659] 11.399358  8.411755 12.466745 38.960972  6.082881 10.011945 16.380300
##  [666] 19.868914  5.998285  5.705230 12.567033 25.353496  8.902688  7.567230
##  [673]  9.369839  7.289091 17.164473  8.870408  6.763125  6.123407 12.441599
##  [680] 18.637960  5.603233 14.786531 28.839256 11.556829 10.425369 22.931051
##  [687] 27.869344  8.903341 25.882587 11.173832 13.747349  3.696858 18.324510
##  [694] 10.089017  9.574540 12.723347  3.969666 10.263652 11.423574 12.723347
##  [701]  7.416307 10.754012 11.716112 12.205877 14.523380  8.861752  6.859389
##  [708]  4.240356 24.199980 13.748795  5.999869 10.754012  7.741729 16.380300
##  [715] 18.384857  9.769501 11.780865 16.229769  3.696858 19.110606  8.587918
##  [722]  7.677090  8.661545 10.552663 13.858305  8.498527  8.300828  5.999869
##  [729] 11.189485  3.696858  3.083185  4.844452 14.523380  9.018784 27.869344
##  [736]  3.416521  7.232464  8.371008  8.903341  7.370914  5.440476 12.441599
##  [743] 17.164473 17.885162  9.753318 10.500501  8.587918 16.860813  3.416345
##  [750] 17.885162  3.022686 12.145491  6.364545 11.944947  4.837180 11.923134
##  [757] 10.742021 20.685614  4.453693  8.902688 14.328320  5.033111  5.999869
##  [764] 13.259467 21.414340 11.255072 11.255072 19.593698 16.798833 17.196970
##  [771] 12.799737  8.520165  8.838920 13.485158  6.432516 10.500501  7.624908
##  [778] 10.739928  9.622106  7.715376 18.145348 10.806579  8.442907 17.571000
##  [785]  8.746441  8.675187 19.003499 15.765682 18.787735 10.378807  6.933679
##  [792]  6.933679 19.593698 24.199980 10.643449 12.685021 12.799737 13.485158
##  [799] 15.464898 10.606454  8.371008 20.441713 10.716731  4.844452  6.784111
##  [806] 13.390193 14.328320 10.202440 14.523380  5.504205 14.074218  5.705230
##  [813] 15.758809 18.324510 12.685021 13.747349  9.769501  7.677090 13.711653
##  [820] 20.865359  9.576794 15.758809  6.763125  6.317307 12.466745  6.171794
##  [827]  6.126505  6.859389 10.202440  6.522142 10.500501 17.885162  9.369839
##  [834] 19.721716 17.385257 18.350602  5.809265 18.016146 18.384857 11.448772
##  [841] 15.885833 12.214204 10.100743 15.292133  9.181265 13.875012 12.214204
##  [848] 12.466745 13.259467  7.370914  5.730386 19.754734  7.370914  8.371008
##  [855] 13.369841 16.798833 10.552663 10.606454 10.552663 10.528573  4.844452
##  [862]  8.587918 15.464898  7.567230 15.292133  8.478883  4.916928  4.893574
##  [869] 17.571000  5.084182  6.432516  9.987618 15.158247 11.378978 15.292133
##  [876]  6.541213  9.576794  5.423973  9.191463  8.411755 10.683295  5.696189
##  [883] 10.851811  8.300828 15.885833 19.540936  9.499031 12.205877  8.498527
##  [890] 22.931051 11.780865 20.093110  6.933679 27.869344 10.528573 13.137499
##  [897] 22.972734  3.303600 12.924752  8.620682 10.643449 12.924752 15.758809
##  [904] 15.266901  3.022686 10.742021  7.196298  3.684812  8.439129  8.730911
##  [911]  6.613851 11.374880 10.754012 12.479870 11.423574  8.844229  8.294272
##  [918]  3.553975  9.622106 24.560255  9.574540 10.806579  5.322923  6.317307
##  [925]  8.902688 10.851811 11.227858 26.925677  6.522142  6.541213  8.294272
##  [932] 11.689726  9.132464 24.560255  6.126505 17.571000 10.923991  8.587918
##  [939] 13.390193  5.619202  9.071561  6.646520 13.164780 11.944947 21.806891
##  [946] 11.708776  8.661545  7.757639 15.885833 10.683295 12.865842 13.137499
##  [953] 11.173832  8.520165  5.951457 10.686158  5.033111  6.613851 19.868914
##  [960] 13.164780  6.646520 10.011945 19.110606  9.071561  9.018784 20.806033
##  [967]  6.213733 24.199980 18.324510  6.064092 24.410827 18.016146 16.220477
##  [974] 12.954323  7.543155  9.574540 15.429358 11.661816  8.276915 19.016641
##  [981] 11.378978  9.574540 10.594284 10.523994  8.620682 10.686158 29.556340
##  [988]  5.696189  9.181265 11.716112  5.084182 16.860813  6.261242 26.631753
##  [995]  7.232464  5.322923 14.399568  9.191463 17.385257 19.003499 12.973251
## [1002]  7.196298 11.423574 19.110606  9.132464 11.780865 10.241469 11.708776
## [1009]  5.603233 11.689726 10.528573  5.783882 15.488123 38.960972  9.018784
## [1016]  8.861752 13.369841 10.739928 11.327498  6.082881  9.368690  6.075646
## [1023]  8.804543  6.261242  9.191463
# mahalanobis
library(stats)

heart_MD <- mahalanobis(heart_x, heart_cm, heart_S)
heart_MD
##    [1]  7.567230 18.324510 15.885833  9.366377 18.787735 10.011945 38.960972
##    [8]  8.675187  8.844229  9.931920 15.991847 21.134232  7.715376 15.765682
##   [15] 27.869344  7.715376  9.132464  6.522142  3.684812 10.241469 10.425369
##   [22] 10.594284 12.479870  6.171794  6.495000  7.624908 10.202440 11.374880
##   [29] 16.229769 29.556340 18.145348  3.684812 10.851811 13.648647  3.416521
##   [36] 12.832129 18.384857  8.411755 10.523994  6.859389 12.145491  7.289091
##   [43]  8.902688  8.844229  7.967498  7.196298 12.799737 21.806891  8.163048
##   [50] 16.860813 13.259467  9.071561 24.410827 13.200681 19.721716 19.721716
##   [57] 11.708776  6.364545 10.263652  5.696189 12.567033  8.163048 13.117883
##   [64] 13.513219 12.567033 15.464898 16.158613 17.385257  5.334974 34.149741
##   [71] 14.786531  9.576794  9.954783 11.934555 14.457960  5.261782 11.923134
##   [78] 21.414340  3.303600  3.303600  6.613851 18.350602 12.832129 24.410827
##   [85]  5.334974  3.083185 10.241469 16.346673 15.758809 19.593698  6.000657
##   [92]  7.796978 21.414340 18.145348 12.685021  8.520165  5.767077 10.739928
##   [99] 14.890842 20.806033  5.423973 18.197598  7.796058  5.261782 16.220477
##  [106]  8.804543 10.528573 10.378807 11.689726 11.227858  8.276915 13.390193
##  [113] 14.457960 11.255072 15.266901  7.624908  6.123407 11.556829 12.567033
##  [120]  5.603233  7.796058 18.145348  6.171322 19.868914  9.499031  8.478883
##  [127]  9.746636 16.158613 18.245952 11.378978  8.478883  4.837180 12.799737
##  [134]  3.416521  7.796058 20.865359  7.967498 20.093110  5.261782  4.893574
##  [141] 13.562466 10.716731 10.643449 11.855798  6.082881 14.074218  5.732385
##  [148] 11.327498 16.634908  9.987618 38.960972 21.131070  8.903341  7.289091
##  [155] 17.164473  7.796058 17.905638  7.289091 47.534084  6.194887 22.692537
##  [162] 10.500501 22.692537  8.300828  9.954783 14.786531 13.875012 12.737300
##  [169] 10.923991  7.741729 10.083886 18.312262 17.571000  7.766130  6.933679
##  [176] 25.882587 18.312262 10.742021  8.276915 14.399568 21.414340 17.978966
##  [183]  9.181265  6.126505  5.783882 19.540936 14.074218  9.954783  6.123407
##  [190] 11.399358  4.916928  5.999869 47.534084 15.882222 12.865842 13.513219
##  [197]  8.870408  7.677090  9.753318  8.730911  7.796978 11.855798 16.634908
##  [204] 15.158247  8.163048 13.711653 15.266901  4.916928 24.410827 10.378807
##  [211] 26.925677 13.117883 11.568543 10.923991  5.783882  3.253317  8.294272
##  [218]  6.194887 11.189485  9.931920  7.210865  6.123407 17.978966  7.766130
##  [225] 11.448772  8.746441 18.350602  6.681593 11.661816 21.806891  8.903341
##  [232]  5.084182  6.075646  6.634512  5.033111 15.158247 18.350602  5.705230
##  [239] 16.860813  8.838920 11.982901 10.263652 24.410827 10.425369 12.954323
##  [246]  5.627663 21.131070 11.189485  8.717593  6.126505  8.804543  7.741729
##  [253]  9.208650 19.540936 10.754012 13.668936 12.214204 20.093110 24.560255
##  [260]  9.746636  3.416345  8.746441  7.631827  7.967498 20.865359 12.479870
##  [267] 10.739928 20.392370 16.297574 17.196970  7.543155  5.322923  8.717593
##  [274] 11.227858 20.685614 18.312262 10.683295  5.627663  8.675187  5.448627
##  [281]  5.467871  5.998285  7.707600 17.978966 16.297574 17.196970 10.715211
##  [288] 11.716112  8.433534 11.689726 18.245952  6.213733 13.318870 13.513219
##  [295] 25.882587 13.117883 20.392370 12.452186  5.998285  6.064092  4.453693
##  [302] 14.523380  7.967498  6.171322  6.646520  8.439129  6.681593  3.083185
##  [309] 15.266901  4.240356  9.499031 14.457960 13.648647 13.858305 17.905638
##  [316] 13.711653 10.715211  6.171794 11.934555 23.360333  5.951457  6.432516
##  [323] 19.016641 25.353496  6.364545  4.240356 21.131070 14.328320 13.736614
##  [330] 23.360333  5.334974 15.429358 19.754734  5.504205  6.541213 12.452186
##  [337] 18.197598 14.207833 16.229769  6.763125 24.410827  3.416345  7.837793
##  [344] 18.016146  6.194887 25.353496 11.568543  6.000657 26.631753 13.137499
##  [351]  9.746636  8.730911 11.255072 12.205877  5.705230  8.939781 28.839256
##  [358] 12.865842  9.622106 23.360333  7.631827 10.686158  5.423973 14.316413
##  [365]  7.210865 14.316413 10.100743 15.488123 13.747349  9.731331 21.134232
##  [372]  9.208650  5.504205  5.730386 16.984642 20.685614  9.368690  8.442907
##  [379] 20.392370 16.220477 10.806579 16.297574  7.232464 12.452186 12.723347
##  [386]  9.731331  6.495000 22.692537 10.742021 20.441713 11.374880 13.748795
##  [393]  9.731331 34.149741 17.164473  8.163048 18.637960 14.074218 20.685614
##  [400] 20.441713 13.200681 15.991847  9.769501  4.453693  9.576794 16.229769
##  [407] 10.241469  8.870408  8.939781 12.832129  5.448627  9.954783 14.328320
##  [414] 13.736614 15.464898  7.707600 14.207833 18.245952  7.796978 12.145491
##  [421]  5.767077 12.924752 12.737300 11.374880 19.016641  6.780655  7.757639
##  [428]  9.933930 26.631753  4.821205 10.263652 13.485158  9.753318 19.754734
##  [435] 12.441599 12.737300 16.346673  8.371008  3.553975  5.730386  8.838920
##  [442]  9.181265 13.562466  9.933930  3.553975 11.661816 14.316413  8.675187
##  [449]  5.732385  4.844452 22.931051  5.619202 21.806891  5.619202 13.485158
##  [456] 15.882222  4.821205  8.717593 10.083886 11.448772 11.419118 11.923134
##  [463] 16.220477  7.543155 47.534084 24.410827  5.627663  8.804543 13.318870
##  [470] 12.865842  8.478883 10.715211  6.784111  7.416307 15.488123 18.197598
##  [477] 19.540936  6.859389  9.987618  8.861752  6.780655 22.931051 15.765682
##  [484]  5.998285 14.890842 10.378807 11.327498  8.294272 11.982901 13.318870
##  [491]  5.767077  9.933930 11.399358  9.208650 12.954323  8.620682 18.637960
##  [498]  5.696189  6.317307 16.984642 15.991847 12.214204 16.984642  5.467871
##  [505]  4.821205  6.681593  9.499031  7.707600 25.882587 29.556340 11.982901
##  [512] 11.556829  7.978755 11.227858  3.083185 13.200681  6.541213  5.448627
##  [519] 13.200681  9.366377 11.934555 17.885162  8.661545  9.366377 10.100743
##  [526]  4.837180 34.149741  7.796978 22.972734 15.882222  8.498527  7.837793
##  [533] 10.716731  5.809265  6.000657 20.806033  3.969666  6.064092  8.844229
##  [540] 15.158247  6.213733  5.440476 13.668936  9.622106 14.207833 15.488123
##  [547] 13.736614 10.594284  5.732385  9.369839 11.419118  9.931920 21.134232
##  [554] 16.158613  8.870408 19.003499 11.934555  7.631827  7.766130 20.392370
##  [561]  6.784111  7.837793  6.495000  7.978755 13.875012  5.809265  6.261242
##  [568]  7.416307  7.757639 26.925677 11.173832  6.171322 11.855798  7.978755
##  [575]  6.075646 10.083886 16.634908 13.736614 13.668936 10.806579 16.798833
##  [582]  5.440476  3.022686 11.556829  9.208650 17.905638 11.399358 28.839256
##  [589]  7.624908  9.931920 13.858305 15.429358 13.648647 15.429358 12.973251
##  [596]  9.366377  9.368690 24.410827  5.730386 10.089017 13.137499 11.189485
##  [603]  8.433534 10.089017  3.553975 17.196970 11.423574 17.385257 12.973251
##  [610] 29.556340 21.134232 11.780865 20.865359 19.721716  6.780655  3.969666
##  [617] 10.606454  6.634512  3.253317 14.890842 13.562466 14.457960 18.787735
##  [624] 10.643449 22.972734 11.419118 16.297574 24.560255 13.164780 13.369841
##  [631] 13.748795  3.253317  5.467871 12.466745  7.567230  5.951457 16.346673
##  [638]  8.433534 13.369841  6.784111  8.939781 14.399568 10.523994 11.944947
##  [645]  3.416345 11.568543  4.821205  8.442907 15.991847  5.467871  8.439129
##  [652]  6.194887 16.380300 18.312262  6.634512  4.893574  7.210865  8.717593
##  [659] 11.399358  8.411755 12.466745 38.960972  6.082881 10.011945 16.380300
##  [666] 19.868914  5.998285  5.705230 12.567033 25.353496  8.902688  7.567230
##  [673]  9.369839  7.289091 17.164473  8.870408  6.763125  6.123407 12.441599
##  [680] 18.637960  5.603233 14.786531 28.839256 11.556829 10.425369 22.931051
##  [687] 27.869344  8.903341 25.882587 11.173832 13.747349  3.696858 18.324510
##  [694] 10.089017  9.574540 12.723347  3.969666 10.263652 11.423574 12.723347
##  [701]  7.416307 10.754012 11.716112 12.205877 14.523380  8.861752  6.859389
##  [708]  4.240356 24.199980 13.748795  5.999869 10.754012  7.741729 16.380300
##  [715] 18.384857  9.769501 11.780865 16.229769  3.696858 19.110606  8.587918
##  [722]  7.677090  8.661545 10.552663 13.858305  8.498527  8.300828  5.999869
##  [729] 11.189485  3.696858  3.083185  4.844452 14.523380  9.018784 27.869344
##  [736]  3.416521  7.232464  8.371008  8.903341  7.370914  5.440476 12.441599
##  [743] 17.164473 17.885162  9.753318 10.500501  8.587918 16.860813  3.416345
##  [750] 17.885162  3.022686 12.145491  6.364545 11.944947  4.837180 11.923134
##  [757] 10.742021 20.685614  4.453693  8.902688 14.328320  5.033111  5.999869
##  [764] 13.259467 21.414340 11.255072 11.255072 19.593698 16.798833 17.196970
##  [771] 12.799737  8.520165  8.838920 13.485158  6.432516 10.500501  7.624908
##  [778] 10.739928  9.622106  7.715376 18.145348 10.806579  8.442907 17.571000
##  [785]  8.746441  8.675187 19.003499 15.765682 18.787735 10.378807  6.933679
##  [792]  6.933679 19.593698 24.199980 10.643449 12.685021 12.799737 13.485158
##  [799] 15.464898 10.606454  8.371008 20.441713 10.716731  4.844452  6.784111
##  [806] 13.390193 14.328320 10.202440 14.523380  5.504205 14.074218  5.705230
##  [813] 15.758809 18.324510 12.685021 13.747349  9.769501  7.677090 13.711653
##  [820] 20.865359  9.576794 15.758809  6.763125  6.317307 12.466745  6.171794
##  [827]  6.126505  6.859389 10.202440  6.522142 10.500501 17.885162  9.369839
##  [834] 19.721716 17.385257 18.350602  5.809265 18.016146 18.384857 11.448772
##  [841] 15.885833 12.214204 10.100743 15.292133  9.181265 13.875012 12.214204
##  [848] 12.466745 13.259467  7.370914  5.730386 19.754734  7.370914  8.371008
##  [855] 13.369841 16.798833 10.552663 10.606454 10.552663 10.528573  4.844452
##  [862]  8.587918 15.464898  7.567230 15.292133  8.478883  4.916928  4.893574
##  [869] 17.571000  5.084182  6.432516  9.987618 15.158247 11.378978 15.292133
##  [876]  6.541213  9.576794  5.423973  9.191463  8.411755 10.683295  5.696189
##  [883] 10.851811  8.300828 15.885833 19.540936  9.499031 12.205877  8.498527
##  [890] 22.931051 11.780865 20.093110  6.933679 27.869344 10.528573 13.137499
##  [897] 22.972734  3.303600 12.924752  8.620682 10.643449 12.924752 15.758809
##  [904] 15.266901  3.022686 10.742021  7.196298  3.684812  8.439129  8.730911
##  [911]  6.613851 11.374880 10.754012 12.479870 11.423574  8.844229  8.294272
##  [918]  3.553975  9.622106 24.560255  9.574540 10.806579  5.322923  6.317307
##  [925]  8.902688 10.851811 11.227858 26.925677  6.522142  6.541213  8.294272
##  [932] 11.689726  9.132464 24.560255  6.126505 17.571000 10.923991  8.587918
##  [939] 13.390193  5.619202  9.071561  6.646520 13.164780 11.944947 21.806891
##  [946] 11.708776  8.661545  7.757639 15.885833 10.683295 12.865842 13.137499
##  [953] 11.173832  8.520165  5.951457 10.686158  5.033111  6.613851 19.868914
##  [960] 13.164780  6.646520 10.011945 19.110606  9.071561  9.018784 20.806033
##  [967]  6.213733 24.199980 18.324510  6.064092 24.410827 18.016146 16.220477
##  [974] 12.954323  7.543155  9.574540 15.429358 11.661816  8.276915 19.016641
##  [981] 11.378978  9.574540 10.594284 10.523994  8.620682 10.686158 29.556340
##  [988]  5.696189  9.181265 11.716112  5.084182 16.860813  6.261242 26.631753
##  [995]  7.232464  5.322923 14.399568  9.191463 17.385257 19.003499 12.973251
## [1002]  7.196298 11.423574 19.110606  9.132464 11.780865 10.241469 11.708776
## [1009]  5.603233 11.689726 10.528573  5.783882 15.488123 38.960972  9.018784
## [1016]  8.861752 13.369841 10.739928 11.327498  6.082881  9.368690  6.075646
## [1023]  8.804543  6.261242  9.191463
heart$pvalues <- pchisq(heart_MD, df=3, lower.tail=FALSE)
heart
## # A tibble: 1,025 × 15
##      age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##    <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl>
##  1    52     1     0      125   212     0       1     168     0     1       2
##  2    53     1     0      140   203     1       0     155     1     3.1     0
##  3    70     1     0      145   174     0       1     125     1     2.6     0
##  4    61     1     0      148   203     0       1     161     0     0       2
##  5    62     0     0      138   294     1       1     106     0     1.9     1
##  6    58     0     0      100   248     0       0     122     0     1       1
##  7    58     1     0      114   318     0       2     140     0     4.4     0
##  8    55     1     0      160   289     0       0     145     1     0.8     1
##  9    46     1     0      120   249     0       0     144     0     0.8     2
## 10    54     1     0      122   286     0       0     116     1     3.2     1
## # … with 1,015 more rows, and 4 more variables: ca <dbl>, thal <dbl>,
## #   target <dbl>, pvalues <dbl>
# Create a matrix of categorical variables
# heart_cat = subset(heart, select = c(sex,cp, fbs, restecg, exang, slope, ca, thal))
# 
# heart_cat_matrix <- as.matrix(heart_cat)
# heart_cat_matrix
# 
# dim(heart_cat_matrix)
# length(heart$target)


# The warning message suggests that the chi-squared approximation may be incorrect, 
# which can happen when the expected frequency counts are too low. One way to address this 
# is to merge categories with low frequencies into a single category.
# For example, let's say that the categories "0" and "1" have low frequencies in the "thal" column. 
# We can merge them into a single category "0/1" using the following code:

heart$thal
##    [1] 3 3 3 3 2 2 1 3 3 2 2 3 2 3 0 2 2 3 2 2 2 2 2 2 2 3 2 2 1 2 1 2 3 3 2 2 2
##   [38] 2 3 3 2 3 2 3 2 2 1 3 2 3 2 3 2 3 3 3 3 2 3 2 2 2 2 2 2 3 2 1 2 3 3 3 2 3
##   [75] 3 2 2 3 2 2 3 2 2 2 2 2 2 2 2 3 2 2 3 1 2 2 2 3 1 2 2 3 3 2 1 2 3 3 3 2 2
##  [112] 3 3 3 1 3 3 3 2 2 3 1 3 2 3 2 2 2 2 1 2 2 1 2 3 1 2 2 2 2 3 2 2 2 2 3 2 3
##  [149] 3 2 1 3 3 3 3 3 3 3 3 2 2 2 2 3 2 3 1 2 2 2 2 3 2 2 3 3 3 2 2 3 3 2 3 2 2
##  [186] 3 3 2 3 3 2 3 3 2 2 2 3 2 2 3 2 2 3 3 2 2 1 2 2 3 3 2 3 2 2 2 3 2 3 2 2 3
##  [223] 2 2 2 3 2 2 2 3 3 2 3 2 2 3 2 3 3 2 2 3 2 2 2 2 3 3 2 2 2 2 3 3 3 2 2 2 3
##  [260] 2 2 3 2 2 1 2 3 2 3 2 3 3 2 2 1 3 3 2 3 2 2 2 2 2 3 2 3 2 2 3 2 3 2 2 3 2
##  [297] 2 3 2 2 2 3 2 3 2 3 2 2 1 2 3 3 3 2 3 2 3 2 3 0 2 2 3 1 2 2 3 1 2 0 2 2 2
##  [334] 2 3 3 3 2 1 3 2 2 2 3 2 1 3 2 3 3 2 3 3 1 3 2 1 2 2 0 2 3 2 2 2 2 3 3 3 2
##  [371] 3 3 2 2 2 1 2 2 2 1 3 3 3 3 3 2 2 2 2 1 2 3 2 3 3 2 3 3 1 1 3 2 2 2 3 1 2
##  [408] 3 2 2 2 2 1 2 3 2 2 2 2 2 2 2 2 2 3 3 2 2 3 2 3 3 2 2 2 2 2 3 2 2 2 3 3 2
##  [445] 2 2 2 3 2 2 3 2 3 2 3 2 2 2 2 2 3 2 1 3 3 2 2 2 2 2 2 3 2 2 3 3 3 3 2 3 3
##  [482] 3 3 2 1 3 3 3 2 2 2 2 3 3 2 3 3 2 2 2 2 2 2 2 2 2 3 2 3 2 2 3 2 2 2 3 3 2
##  [519] 3 3 3 3 2 3 3 2 3 2 3 2 3 2 2 2 2 2 2 2 3 3 3 2 2 2 2 3 2 2 2 3 3 2 3 2 3
##  [556] 3 3 2 2 2 2 2 2 2 1 2 2 2 2 3 2 3 2 2 3 2 3 2 2 3 3 2 2 3 3 3 3 1 3 2 2 2
##  [593] 3 2 1 3 2 2 2 2 3 3 2 2 2 2 2 1 1 2 3 3 1 3 3 2 2 2 2 1 3 3 2 2 3 3 3 3 2
##  [630] 2 3 2 2 2 3 2 2 2 2 2 2 3 3 3 2 3 2 2 2 2 3 2 2 3 2 2 2 2 3 2 2 1 2 2 2 2
##  [667] 2 3 2 1 2 3 3 3 3 3 3 3 2 3 2 3 1 3 2 3 0 3 3 2 3 2 3 2 3 3 2 3 2 3 2 3 2
##  [704] 1 3 3 3 2 2 3 3 3 2 2 2 2 3 1 2 3 2 2 2 2 2 3 3 3 3 2 2 2 3 2 0 2 3 3 3 3
##  [741] 2 2 3 3 2 2 2 3 2 3 2 2 2 3 2 2 2 1 2 2 1 2 3 2 3 3 3 3 3 2 1 2 2 3 2 2 3
##  [778] 3 2 2 1 3 2 2 3 3 3 3 2 3 3 3 3 2 2 2 1 3 3 2 3 1 2 2 2 3 1 2 3 2 3 3 2 3
##  [815] 2 3 2 2 2 1 3 2 3 2 2 2 2 3 2 3 2 3 3 3 1 2 2 3 2 2 3 2 3 2 3 1 2 2 2 3 2
##  [852] 2 3 3 2 3 2 2 2 3 2 2 3 3 2 2 2 2 2 2 2 2 3 1 2 3 3 2 3 2 3 2 3 3 3 3 3 1
##  [889] 3 3 3 2 3 0 3 3 3 2 2 3 2 2 2 1 2 2 2 2 3 3 3 2 3 2 2 3 3 2 2 3 3 3 3 2 2
##  [926] 3 2 3 3 3 3 3 2 3 2 2 2 2 3 2 3 2 2 3 3 3 2 2 3 3 2 3 2 2 2 3 2 3 2 2 2 2
##  [963] 3 3 2 2 3 2 3 2 2 3 1 2 3 3 2 2 2 3 1 3 2 3 3 3 2 2 3 2 2 3 2 3 3 3 3 3 1
## [1000] 3 1 2 2 3 2 3 2 3 2 3 3 2 3 1 2 3 2 3 3 2 2 3 2 2 3
heart$thal <- as.character(heart$thal) # convert to character
heart$thal[heart$thal %in% c("0", "1")] <- "0/1" # merge categories
heart$thal <- factor(heart$thal) # convert back to factor


# Convert categorical variables to factors
heart$sex <- as.factor(heart$sex)
heart$cp <- as.factor(heart$cp)
heart$fbs <- as.factor(heart$fbs)
heart$restecg <- as.factor(heart$restecg)
heart$exang <- as.factor(heart$exang)
heart$slope <- as.factor(heart$slope)
heart$ca <- as.factor(heart$ca)
heart$thal <- as.factor(heart$thal)

# Create contingency tables
sex_table <- table(heart$sex, heart$target)
cp_table <- table(heart$cp, heart$target)
fbs_table <- table(heart$fbs, heart$target)
restecg_table <- table(heart$restecg, heart$target)
exang_table <- table(heart$exang, heart$target)
slope_table <- table(heart$slope, heart$target)
ca_table <- table(heart$ca, heart$target)
thal_table <- table(heart$thal, heart$target)

# Perform chi-squared test on each table
sex_chi <- chisq.test(sex_table)
cp_chi <- chisq.test(cp_table)
fbs_chi <- chisq.test(fbs_table)
restecg_chi <- chisq.test(restecg_table)
exang_chi <- chisq.test(exang_table)
slope_chi <- chisq.test(slope_table)
ca_chi <- chisq.test(ca_table)
thal_chi <- chisq.test(thal_table)


# Print p-values for each variable
print(paste0("Sex p-value: ", sex_chi$p.value))
## [1] "Sex p-value: 6.65682068172645e-19"
print(paste0("Chest pain type p-value: ", cp_chi$p.value))
## [1] "Chest pain type p-value: 1.29806646948206e-60"
print(paste0("Fasting blood sugar > 120 mg/dl p-value: ", fbs_chi$p.value))
## [1] "Fasting blood sugar > 120 mg/dl p-value: 0.218624131028943"
print(paste0("Resting electrocardiographic results p-value: ", restecg_chi$p.value))
## [1] "Resting electrocardiographic results p-value: 1.69642510038776e-08"
print(paste0("Exercise induced angina p-value: ", exang_chi$p.value))
## [1] "Exercise induced angina p-value: 2.82663742966344e-44"
print(paste0("ST segment slope p-value: ", slope_chi$p.value))
## [1] "ST segment slope p-value: 1.42108523925458e-34"
print(paste0("Number of major vessels colored by flourosopy p-value: ", ca_chi$p.value))
## [1] "Number of major vessels colored by flourosopy p-value: 1.74701345104618e-54"
print(paste0("Thalassemia p-value: ", thal_chi$p.value))
## [1] "Thalassemia p-value: 1.52159810768481e-61"
# Create contingency tables for each feature
heart_cat <- heart[, c("sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal")]
cont_tables <- lapply(heart_cat, function(x) table(x, heart$target))
cont_tables
## $sex
##    
## x     0   1
##   0  86 226
##   1 413 300
## 
## $cp
##    
## x     0   1
##   0 375 122
##   1  33 134
##   2  65 219
##   3  26  51
## 
## $fbs
##    
## x     0   1
##   0 417 455
##   1  82  71
## 
## $restecg
##    
## x     0   1
##   0 283 214
##   1 204 309
##   2  12   3
## 
## $exang
##    
## x     0   1
##   0 225 455
##   1 274  71
## 
## $slope
##    
## x     0   1
##   0  46  28
##   1 324 158
##   2 129 340
## 
## $ca
##    
## x     0   1
##   0 163 415
##   1 160  66
##   2 113  21
##   3  60   9
##   4   3  15
## 
## $thal
##      
## x       0   1
##   0/1  47  24
##   2   132 412
##   3   320  90
# Perform chi-squared tests for each contingency table
chi_results <- lapply(cont_tables, function(x) chisq.test(x))
chi_results
## $sex
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  x
## X-squared = 78.863, df = 1, p-value < 2.2e-16
## 
## 
## $cp
## 
##  Pearson's Chi-squared test
## 
## data:  x
## X-squared = 280.98, df = 3, p-value < 2.2e-16
## 
## 
## $fbs
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  x
## X-squared = 1.5134, df = 1, p-value = 0.2186
## 
## 
## $restecg
## 
##  Pearson's Chi-squared test
## 
## data:  x
## X-squared = 35.784, df = 2, p-value = 1.696e-08
## 
## 
## $exang
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  x
## X-squared = 194.82, df = 1, p-value < 2.2e-16
## 
## 
## $slope
## 
##  Pearson's Chi-squared test
## 
## data:  x
## X-squared = 155.87, df = 2, p-value < 2.2e-16
## 
## 
## $ca
## 
##  Pearson's Chi-squared test
## 
## data:  x
## X-squared = 257.29, df = 4, p-value < 2.2e-16
## 
## 
## $thal
## 
##  Pearson's Chi-squared test
## 
## data:  x
## X-squared = 280.08, df = 2, p-value < 2.2e-16
# Extract p-values from each test
p_values <- sapply(chi_results, function(x) x$p.value)
p_values
##          sex           cp          fbs      restecg        exang        slope 
## 6.656821e-19 1.298066e-60 2.186241e-01 1.696425e-08 2.826637e-44 1.421085e-34 
##           ca         thal 
## 1.747013e-54 1.521598e-61
# Choose a significance threshold
sig_threshold <- 0.05

# Select significant features based on p-values
sig_features <- names(p_values[p_values < sig_threshold])

# Print significant features
print(sig_features)
## [1] "sex"     "cp"      "restecg" "exang"   "slope"   "ca"      "thal"
# So, we got the significant features:
# "sex", "cp", "restecg", "exang", "slope", "ca", "thal"   



# chi square plot
plot(qchisq((1:nrow(heart_x) - 1/2) / nrow(heart_x), df = 3), sort(d),
     xlab = expression(paste(chi[3]^2, " Quantile")),
     ylab = "Ordered distances")
abline(a = 0, b = 1)

# From the chi square plot, we see that the distances appear to be roughly linearly related 
# to the quantiles of the chi-squared distribution, and the points fall close to the diagonal line. 
# This suggests that the Mahalanobis distance metric may be appropriate for this dataset. 
# However, further analysis and interpretation may be needed to draw more definitive conclusions.

# Correlations
pairs(heart_x)

# Now, let us see the distribution of Heart Disease

# First, we will be looking at how many data points do we have where the client has or does not 
# have heart disease. We would like to make sure that the dataset is has a balance of 
# people with and without heart disease in order to create an accurate model.
heart %>%
  count(target)
## # A tibble: 2 × 2
##   target     n
##    <dbl> <int>
## 1      0   499
## 2      1   526
# As we can see from the result the dataset is also balanced as we have 526 datapoints 
# which dont have heart disease and 499 data points with heart disease. 
# While the dataset might seem balanced on the surface when we look further we will see that 
# the dataset is not balanced based on gender

# 1) Hypothesis 1
# Sex: Studies have shown that men are more likely to develop heart disease than women. 
# In the Heart Disease dataset, we can use a contingency table and chi-square test to 
# investigate the association between sex and the presence of heart disease.

# Which population is diagnosed more with a heart disease? Male Population or Female population?

# The signs of a woman having a heart attack are much less noticeable than the signs of a male. 
# In women, heart attacks may feel uncomfortable squeezing, pressure, fullness, or pain in the 
# center of the chest. It may also cause pain in one or both arms, the back, neck, jaw or stomach, 
# shortness of breath, nausea and other symptoms. Men experience typical symptoms of heart attack, 
# such as chest pain , discomfort, and stress. They may also experience pain in other areas, 
# such as arms, neck , back, and jaw, and shortness of breath, sweating, and discomfort that 
# mimics heartburn.

# source: healthblog.uofmhealth

# The proportion of females and males in the dataset.

heart %>% 
  group_by(sex) %>% 
  summarise(percent = 100 * n() / nrow(heart))
## # A tibble: 2 × 2
##   sex   percent
##   <fct>   <dbl>
## 1 0        30.4
## 2 1        69.6
# sex   percent
# <fct>   <dbl>
#  0       30.4
#  1       69.6

# There are 30.4 % females and 69.6% males in the dataset


# Distribution of Heart Disease among males and females
# Create a contingency table of sex and target (heart disease)

Sub_female <- table(heart[heart$sex==0,]$target)
Sub_male <- table(heart[heart$sex==1,]$target)
FM_combine <- rbind(Sub_female,Sub_male)

#Rename columns names and rows names.
colnames(FM_combine) <- c("Has heart disease", "Does not have heart disease")
rownames(FM_combine) <- c("Females", "Males")

#Display the table
FM_combine
##         Has heart disease Does not have heart disease
## Females                86                         226
## Males                 413                         300
# There are 86 females out of 312 who have diagnosed with heart disease and 413 males out of 713 
# were diagnosed with heart disease.
# This indicates that 58% of males in this dataset are diagnosed with heart disease 
# where is only 27% of females are diagnosed with heart disease.


# Perform a chi-square test on the contingency table
chi_test <- chisq.test(FM_combine)
chi_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  FM_combine
## X-squared = 78.863, df = 1, p-value < 2.2e-16
# The output from performing Pearson's chi-squared test with Yates' continuity correction 
# on a contingency table:

# Pearson's Chi-squared test with Yates' continuity correction
# data:  cont_table
# X-squared = 78.863, df = 1, p-value < 2.2e-16


# The chi-squared test tests the null hypothesis that there is no association between the two 
# variables, and the alternative hypothesis that there is a significant association.
# The test statistic X-squared is 78.863 with 1 degree of freedom (df) and a p-value less than 
# 2.2e-16, which is smaller than any significance level commonly used (0.05).
# This indicates strong evidence against the null hypothesis and suggests that there is a 
# significant association between the "sex" and "target" variables in the Heart Disease dataset. 
# Specifically, the test suggests that men are more likely to have heart disease than women 
# since the contingency table shows that a higher proportion of men have heart disease compared 
# to women, and so we can infer that men are more likely to have heart disease than women.



# This is a mosaic plot, helps to visualize the statistical association between two variables.
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## 
## The following object is masked from 'package:Matrix':
## 
##     mean
## 
## The following objects are masked from 'package:pROC':
## 
##     cov, var
## 
## The following objects are masked from 'package:infer':
## 
##     prop_test, t_test
## 
## The following object is masked from 'package:scales':
## 
##     rescale
## 
## The following object is masked from 'package:purrr':
## 
##     cross
## 
## The following object is masked from 'package:ggplot2':
## 
##     stat
## 
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## 
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## 
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
mosaicplot(heart$sex ~ heart$target,
           main="Heart disease outcome by Gender", shade=FALSE,color=TRUE,
           xlab="Gender", ylab="Heart disease")

# Ans:  We can conclude that males are diagnosed more with a heart disease than females



# 2) Hypothesis 
# Age: As age increases, the likelihood of developing heart disease also increases. 
# This is supported by previous studies on cardiovascular disease. In the Heart Disease dataset, 
# we can use a scatter plot to visualize the relationship between age and the presence of heart 
# disease, and a correlation test to measure the strength of the relationship.

ggplot(heart, aes(x = age, y = factor(target), color = target)) +
  geom_point() +
  labs(x = "Age", y = "Presence of Heart Disease", color = "Heart Disease")

# heart$target <- as.numeric(heart$target)
cor.test(heart$age, heart$target, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = -7.5356, df = 1023, p-value = 1.068e-13
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2865322 -0.1704854
## sample estimates:
##        cor 
## -0.2293236
# true correlation is not equal to 0 means that there is a significant linear relationship 
# between the two variables being tested.

# Pearson's product-moment correlation
# 
# data:  heart$age and heart$target
# t = -7.5356, df = 1023, p-value = 1.068e-13
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
#  -0.2865322 -0.1704854
# sample estimates:
#        cor 
# -0.2293236 


# If the p-value is less than the chosen significance level (0.05), we can conclude 
# that there is a significant correlation between age and the presence of heart disease.

# The test statistic is the t-value, which in this case is -7.5356. 
# The degrees of freedom (df) are 1023, and the p-value is 1.068e-13, which is very small 
# and indicates that there is a significant correlation between age and target.

# "alternative hypothesis: true correlation is not equal to 0": 
  # This line specifies the alternative hypothesis, which is that there is a non-zero 
  # correlation between age and target.

# "95 percent confidence interval: -0.2865322 -0.1704854": 
  # This line provides the 95% confidence interval for the correlation coefficient. 
  # In this case, it ranges from -0.2865322 to -0.1704854.

# "sample estimates: cor -0.2293236": 
  # This line provides the sample estimate of the correlation coefficient, which is -0.2293236. 
  # This indicates a moderate negative correlation between age and target.

# Based on the output of the Pearson's correlation test, we can conclude that there is a 
# statistically significant negative correlation between age and the presence of heart disease 
# (target variable) in the Heart Disease dataset.
# The correlation coefficient is -0.2293, which indicates a moderate negative correlation 
# between the two variables. As age increases, the likelihood of having heart disease decreases. 
# However, it's important to note that correlation does not necessarily imply causation, 
# and further analysis would be needed to establish causality.

# Let us create a A faceted histogram which shows the relationship between heart disease and age 

age.plot <- ggplot(heart, mapping = aes(x = age, fill = target)) +
  geom_histogram() +
  facet_wrap(vars(target)) +
  labs(title = "Prevelance of Heart Disease Across Age", x = "Age (years)", y = "Count", fill = "Heart Disease")

age.plot
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# The histograms for age faceted on the presence and absence of heart disease have 
# different distribution shapes, suggesting that age does have a relationship with heart disease. 
# The distribution of the presence of heart disease is left skewed while the distribution 
# of the absence of heart disease appears more normally distributed. 
# These graphics suggests that there are more older people with heart disease than younger 
# people with heart disease.

# This is a boxplot to displays the age distribution of heart diagnosis.
boxplot(heart$age ~ heart$target,
        main="Heart disease diagnosis distribution by Age",
        ylab="Age",xlab="Heart disease diagnosed")

# Ans: We can conclude that people with a higher age are more likely diagnised with a heart disease



# 3) Hypothesis 3
# Chest pain type: The type of chest pain a patient experiences can provide important 
# diagnostic information about heart disease. 
# cp: chest pain type
# — Value 0: asymptomatic
# — Value 1: atypical angina
# — Value 2: non-anginal pain
# — Value 3: typical angina


#The proportion of patients with chest pain types.

heart %>% group_by(cp) %>% 
  summarise( percent = 100 * n() / nrow( heart ))
## # A tibble: 4 × 2
##   cp    percent
##   <fct>   <dbl>
## 1 0       48.5 
## 2 1       16.3 
## 3 2       27.7 
## 4 3        7.51
#  cp    percent
# <fct>   <dbl>
#  0       48.5 
#  1       16.3 
#  2       27.7 
#  3       7.51

# There are 7% of patient with typical angina chest pain, 16 % of patients with atypical angina, 
# 27% with non-angina pain, and 48% with asymptomatic chest pain.


# In the Heart Disease dataset, we can use a box plot to visualize the distribution of chest 
# pain type among patients with and without heart disease, and a t-test to compare the 
# means of the two groups.

# create a box plot to visualize the distribution of chest pain type

ggplot(heart, aes(x = target, y = factor(cp))) + 
  geom_boxplot() +
  xlab("Presence of Heart Disease") +
  ylab("Chest Pain Type")

# The boxplot suggests that chest pain type may be a significant predictor of heart disease in 
# patients.


# This plot to visualize the Heart disease diagnosis Distributions by Chest pain. 
# There are four types of chest pain:(0) asymptomatic, (1) atypical angina, (2) non-anginal pain, 
# and (3) typical angina.

ggplot(data = heart, aes(x = target, fill = cp)) + 
  geom_bar(position = "fill") +
  labs(title = "Heart disease diagnosis Distributions by Chest pain",
       x = "Heart disease diagnosis",
       y = "chest pain") +
  theme_test()

str(heart)
## spc_tbl_ [1,025 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age     : num [1:1025] 52 53 70 61 62 58 58 55 46 54 ...
##  $ sex     : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 2 2 2 2 ...
##  $ cp      : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ trestbps: num [1:1025] 125 140 145 148 138 100 114 160 120 122 ...
##  $ chol    : num [1:1025] 212 203 174 203 294 248 318 289 249 286 ...
##  $ fbs     : Factor w/ 2 levels "0","1": 1 2 1 1 2 1 1 1 1 1 ...
##  $ restecg : Factor w/ 3 levels "0","1","2": 2 1 2 2 2 1 3 1 1 1 ...
##  $ thalach : num [1:1025] 168 155 125 161 106 122 140 145 144 116 ...
##  $ exang   : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 2 1 2 ...
##  $ oldpeak : num [1:1025] 1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
##  $ slope   : Factor w/ 3 levels "0","1","2": 3 1 1 3 2 2 1 2 3 2 ...
##  $ ca      : Factor w/ 5 levels "0","1","2","3",..: 3 1 1 2 4 1 4 2 1 3 ...
##  $ thal    : Factor w/ 3 levels "0/1","2","3": 3 3 3 3 2 2 1 3 3 2 ...
##  $ target  : num [1:1025] 0 0 0 0 0 1 0 0 0 0 ...
##  $ pvalues : num [1:1025] 0.055856 0.000377 0.001197 0.024796 0.000302 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   sex = col_double(),
##   ..   cp = col_double(),
##   ..   trestbps = col_double(),
##   ..   chol = col_double(),
##   ..   fbs = col_double(),
##   ..   restecg = col_double(),
##   ..   thalach = col_double(),
##   ..   exang = col_double(),
##   ..   oldpeak = col_double(),
##   ..   slope = col_double(),
##   ..   ca = col_double(),
##   ..   thal = col_double(),
##   ..   target = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
heart$target <- as.numeric(heart$target)
heart$cp <- as.numeric(heart$cp)

# perform a t-test to compare the means of chest pain type between patients with and without heart disease
t.test(cp ~ target, data = heart)
## 
##  Welch Two Sample t-test
## 
## data:  cp by target
## t = -15.462, df = 1022.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -1.0089917 -0.7817304
## sample estimates:
## mean in group 0 mean in group 1 
##        1.482966        2.378327
# Welch Two Sample t-test
# 
# data:  cp by target
# t = -15.462, df = 1022.9, p-value < 2.2e-16
# alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
# 95 percent confidence interval:
#   -1.0089917 -0.7817304
# sample estimates:
#   mean in group 1 mean in group 2 
# 1.482966        2.378327 

# This is the output of a Welch two-sample t-test which is used to test if there is a significant 
# difference between the means of two groups. In this case, the groups are defined by the presence 
# or absence of heart disease (target variable) and the chest pain type (cp variable).

# The output shows the t-value, degrees of freedom (df), and the p-value. 
# The null hypothesis is that the means of the two groups are equal, 
# and the alternative hypothesis is that they are not equal. 
# A p-value less than the significance level (typically 0.05) indicates that there is 
# evidence to reject the null hypothesis.
# In this case, the p-value is extremely small (less than 2.2e-16), 
# so we can conclude that there is a significant difference between the means of the two groups. 
# The 95% confidence interval shows the range within which we can be 95% confident that the 
# true difference between the means of the two groups lies. 
# The sample estimates show the mean value of chest pain type (cp variable) for the two groups, 
# with group 0 indicating those without heart disease and group 1 indicating those with 
# heart disease.


# A a bar chart is shown for the relationship between heart disease and chest pain type. 

cp.plot <- ggplot(heart, mapping = aes(x=target, fill = factor(cp))) +
  geom_bar(position = "dodge") +
  labs(title = "Prevelance of Heart Disease for Different Chest Pain Types", x = "Heart Disease", y = "Count", fill = "Chest Pain Type")

cp.plot

# There does appear to be a relationship between type of chest pain and heart disease. 
# Interestingly, asymptomatic chest pain type (Value 0) has the highest count for the presence of 
# heart disease, while typical angina pain has the lowest count. There is a higher count 
# of people without heart disease that have atypical or typical angina chest pain compared 
# to people with heart disease. Angina is listed as one of the most common symptoms of 
# heart attack and so this result is skeptical and needs further investigation, but we will 
# assume it is correct for the current analysis.



# Ans: Therefore, chest pain type is a significant predictor of heart disease in patients.




attach(heart)


# Visualizations of each variable are created in order to familiarize ourselves with 
# the predictors. 
# Proportional bar charts are created for our 3 binary variables: sex, fasting blood sugar, 
# and exercise induced angina. Box plots are shown for our 3 numerical variables: 
#   resting blood pressure, serum cholesterol, and maximum heart rate.


heart$target <- as.factor(heart$target)

# sex: The person’s sex (1 = male, 0 = female)
sex.plot <- ggplot(heart, mapping = aes(x = sex, fill = target)) +
  geom_bar(position = "fill") +
  labs(x = "Sex", y = "Proportion", fill = "Heart Disease") +
  theme(axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 12), 
        axis.title.y = element_text(size = 12), axis.text.y = element_text(size = 12))

# fbs: The person’s fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)
fbs.plot <- ggplot(heart, mapping = aes(x=fbs, fill=target)) +
  geom_bar(position = "fill") +
  labs(x = "Fasting Blood Sugar", y = "Proportion", fill = "Heart Disease") +
  scale_x_discrete(labels = c("low", "high"))+
  theme(axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 12), 
        axis.title.y = element_text(size = 12), axis.text.y = element_text(size = 12))

# exang: Exercise induced angina (1 = yes; 0 = no)
exang.plot <- ggplot(heart, mapping = aes(x = exang, fill = target)) +
  geom_bar(position = "fill") +
  labs(x = "Exercise induced angina", y = "Proportion", fill = "Heart Disease") +
  theme(axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 12))


grid.arrange(sex.plot, fbs.plot, exang.plot, nrow=2)

# The bar plot on the top left shows a higher proportion of males with heart disease than 
# females with heart disease, suggesting that there is a relationship between sex and heart disease. 
# There is an even larger distinction for heart disease as it relates to exercise induced angina, 
# for a much higher proportion of people with exercise induced angina have heart disease compared 
# to people without exercise induced angina (bottom left plot). 
# Fasting blood sugar levels do not appear to have a correlation with heart disease, as there 
# appears to be a similar proportion of presence and absence of heart disease for people with 
# high and low fasting blood sugar levels (top right plot).

# More visualizations

# trestbps: The person’s resting blood pressure (mm Hg on admission to the hospital)
trestbps.plot <- ggplot(heart, mapping = aes(x=trestbps, y=target)) +
  geom_boxplot() +
  labs(x = "Resting Blood Pressure (mm Hg)", y = "Heart Disease") +
  theme(axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 12), 
        axis.title.y = element_text(size = 12), axis.text.y = element_text(size = 12))

# chol: The person’s cholesterol measurement in mg/dl
chol.plot <- ggplot(heart, mapping = aes(x=chol, y=target)) +
  geom_boxplot() +
  labs(x = "Serum Cholestoral (mg/dl)", y = "Heart Disease") +
  theme(axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 12), 
        axis.title.y = element_text(size = 12), axis.text.y = element_text(size = 12))

# thalach: The person’s maximum heart rate achieved
maxhr.plot <- ggplot(heart, mapping = aes(x = thalach, y = target)) +
  geom_boxplot() +
  labs(x = "Maximum Heart Rate (bpm)", y = "Heart Disease") +
  theme(axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 12), 
        axis.title.y = element_text(size = 12), axis.text.y = element_text(size = 12))

grid.arrange(trestbps.plot, chol.plot, maxhr.plot, nrow=2)

# The box plots for resting blood pressure (top left) appear similar for the presence and 
# absence of heart disease; the boxes (middle 50% of data) overlap and have similar centers 
# and boundaries. These similar distributions suggest that there is likely not a strong 
# relationship between resting blood pressure and heart disease. 
# No strong relationship between serum cholesterol and heart disease is visible either, 
# for these distributions are close in center and spread (top right). 
# However, the last box plot does show a relationship between maximum heart rate achieved 
# during exercise and heart disease. The trend of maximum heart rate appears to be higher for 
# the absence of heart disease compared to the presence of heart disease, and there is 
# little overlap between the two boxes.

# And, to visualize the relationship between restecg and target, we can create a stacked bar plot. 
# This plot shows the count of each value of restecg for each value of target.

# restecg: resting electrocardiographic results
#   Value 0: showing probable or definite left ventricular hypertrophy by Estes’ criteria
#   Value 1: normal
#   Value 2: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
ggplot(heart, aes(x = target, fill = factor(restecg))) +
  geom_bar()

# From the above plot, we can infer that restecg value 0 and value 2 is found more in common with 
# people having a heart disease. 
# So, restecg is also a important feature.


# From the above observations, we got the significant features which is correct:
# "sex", "cp", "restecg", "exang", "slope", "ca", "thal"   



# More useful visualizations 


# Another plot to visualize heart disease diagnosis Distributions by Number of major vessels.
# ca: The number of major vessels (0–3)

ggplot(data = heart, aes(x = target, fill = ca)) + 
  geom_bar(position = "fill") +
  labs(title = "Heart disease diagnosis Distributions by Number of major vessels ",
       x = "Heart disease diagnosis",
       y = "thal") +
  theme_test()

# Here is the scatterplot showing the relationship between age and maximum heart rate achieved 
# in the heart dataset:
  
ggplot(heart, aes(x=age, y=thalach, color=target)) + 
  geom_point(alpha=0.5) +
  labs(x="Age", y="Max heart rate", color="Target") +
  theme_minimal()

# The downward trend indicates that as age increases, maximum heart rate achieved tends to decrease. 
# This relationship appears to be somewhat linear, although there is some variability in the data.


# Histogram of patient’s age and gender

meanAge <- data.frame(sex = c(0, 1), age = c(mean(heart[heart$sex==0,]$age),mean(heart[heart$sex==1,]$age)))

#ggplot of age of the patients categorized by sex
Plot <- ggplot(heart, aes(x=age, fill=as.factor(sex))) +
  geom_histogram(alpha=0.5, position="identity")+ 
  geom_vline(aes(xintercept = age), meanAge)+
  facet_wrap(~as.factor(sex))+
  labs(title="Histogram of patients's age by gender", 
       x="Age of patients", y="Count", fill="Sex")+
  geom_text(meanAge, mapping=aes(x=age, y=8.5, label=paste("Mean=", signif(age,4))),
            size=4, angle=90, vjust=-0.4, hjust=0)+
  scale_fill_discrete(breaks=c("0", "1"),
                      labels=c("0 - Female", "1 - Male"))

#display the Plot
Plot
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Heart disease diagnosis frequency by Resting electrocardiographic results and sex
heart$restecg <- as.factor(heart$restecg)
heart %>%
  ggplot(aes(x = target, fill=restecg)) + 
  geom_bar(position = "dodge") +
  facet_grid(~sex) +
  scale_fill_brewer(palette = "Dark2") +
  labs(title="Heart disease diagnosis frequency by restecg and sex")

heart$age <- as.numeric(heart$age)
heart$sex <- as.numeric(heart$sex)
heart$cp <- as.numeric(heart$cp)
heart$trestbps <- as.numeric(heart$trestbps)
heart$chol <- as.numeric(heart$chol)
heart$fbs <- as.numeric(heart$fbs)
heart$restecg <- as.numeric(heart$restecg)
heart$thalach <- as.numeric(heart$thalach)
heart$exang <- as.numeric(heart$exang)
heart$oldpeak <- as.numeric(heart$oldpeak)
heart$slope <- as.numeric(heart$slope)
heart$ca <- as.numeric(heart$ca)
heart$thal <- as.numeric(heart$thal)

correlations <- cor(heart[,1:13])
corrplot(correlations, method="circle")

# A dot-representation was used where blue represents positive correlation and red negative. 
# The larger the dot the larger the correlation.

# -> As age increases, cholesterol, stress of heart during exercise and resting bp also increase. 
#    On the other hand, maximum heart rate falls with old age.
# -> As cholesterol increases, stress of heart during exercise and resting bp increase, 
     # while maximum heart rate falls.
# -> As ST depression rises, i.e. stress of the heart falls, resting bp rises.
# -> Resting bp also has a negative relation with maximum heart rate.
# -> The degree of correlation is very small between all variables. However, age and maximum 
     # heart rate show a slightly higher correlation.
# -> St depression and maximum heart rate also show similar results.




# Feature Importance
# There are different ways to identify the important features in the data.
# 
# 1- Correlation
# 
# 2- Random Forest: Gini Importance or Mean Decrease in Impurity (MDI) calculates each feature 
# importance as the sum over the number of splits (across all tress) that include the feature, 
# proportionally to the number of samples it splits.

corelations = data.frame(cor(heart[,1:13], use = "complete.obs"))
corelations
##                  age         sex          cp    trestbps        chol
## age       1.00000000 -0.10324030 -0.07196627  0.27112141  0.21982253
## sex      -0.10324030  1.00000000 -0.04111909 -0.07897377 -0.19825787
## cp       -0.07196627 -0.04111909  1.00000000  0.03817742 -0.08164102
## trestbps  0.27112141 -0.07897377  0.03817742  1.00000000  0.12797743
## chol      0.21982253 -0.19825787 -0.08164102  0.12797743  1.00000000
## fbs       0.12124348  0.02720046  0.07929359  0.18176662  0.02691716
## restecg  -0.13269617 -0.05511721  0.04358061 -0.12379409 -0.14741024
## thalach  -0.39022708 -0.04936524  0.30683928 -0.03926407 -0.02177209
## exang     0.08816338  0.13915681 -0.40151271  0.06119697  0.06738223
## oldpeak   0.20813668  0.08468656 -0.17473348  0.18743411  0.06488031
## slope    -0.16910511 -0.02666629  0.13163278 -0.12044531 -0.01424787
## ca        0.27155053  0.11172891 -0.17620647  0.10455372  0.07425934
## thal      0.07224456  0.20211721 -0.16985403  0.05894881  0.09552541
##                   fbs     restecg      thalach       exang     oldpeak
## age       0.121243479 -0.13269617 -0.390227075  0.08816338  0.20813668
## sex       0.027200461 -0.05511721 -0.049365243  0.13915681  0.08468656
## cp        0.079293586  0.04358061  0.306839282 -0.40151271 -0.17473348
## trestbps  0.181766624 -0.12379409 -0.039264069  0.06119697  0.18743411
## chol      0.026917164 -0.14741024 -0.021772091  0.06738223  0.06488031
## fbs       1.000000000 -0.10405124 -0.008865857  0.04926057  0.01085948
## restecg  -0.104051244  1.00000000  0.048410637 -0.06560553 -0.05011425
## thalach  -0.008865857  0.04841064  1.000000000 -0.38028087 -0.34979616
## exang     0.049260570 -0.06560553 -0.380280872  1.00000000  0.31084376
## oldpeak   0.010859481 -0.05011425 -0.349796163  0.31084376  1.00000000
## slope    -0.061902374  0.08608609  0.395307843 -0.26733547 -0.57518854
## ca        0.137156259 -0.07807235 -0.207888416  0.10784854  0.22181603
## thal     -0.030129129 -0.02030400 -0.106701873  0.20958207  0.20473483
##                slope          ca        thal
## age      -0.16910511  0.27155053  0.07224456
## sex      -0.02666629  0.11172891  0.20211721
## cp        0.13163278 -0.17620647 -0.16985403
## trestbps -0.12044531  0.10455372  0.05894881
## chol     -0.01424787  0.07425934  0.09552541
## fbs      -0.06190237  0.13715626 -0.03012913
## restecg   0.08608609 -0.07807235 -0.02030400
## thalach   0.39530784 -0.20788842 -0.10670187
## exang    -0.26733547  0.10784854  0.20958207
## oldpeak  -0.57518854  0.22181603  0.20473483
## slope     1.00000000 -0.07344041 -0.09650143
## ca       -0.07344041  1.00000000  0.14576170
## thal     -0.09650143  0.14576170  1.00000000
# Splitting the data for females and males in order to find the most important factor 
# leading to heart disease in each gender.


#Create a subset for males only.
Male_Data <- subset(heart, sex==1)
Male_Data
## # A tibble: 312 × 15
##      age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##    <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl>
##  1    62     1     1      138   294     2       2     106     1     1.9     2
##  2    58     1     1      100   248     1       1     122     1     1       2
##  3    71     1     1      112   149     1       2     125     1     1.6     2
##  4    43     1     1      132   341     2       1     136     2     3       2
##  5    34     1     2      118   210     1       2     192     1     0.7     3
##  6    34     1     2      118   210     1       2     192     1     0.7     3
##  7    51     1     3      140   308     1       1     142     1     1.5     3
##  8    50     1     2      120   244     1       2     162     1     1.1     3
##  9    67     1     1      106   223     1       2     142     1     0.3     3
## 10    63     1     3      135   252     1       1     172     1     0       3
## # … with 302 more rows, and 4 more variables: ca <dbl>, thal <dbl>,
## #   target <fct>, pvalues <dbl>
#Create another subset for females only.
Female_Data <- subset(heart, sex != 1)
Female_Data
## # A tibble: 713 × 15
##      age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##    <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl>
##  1    52     2     1      125   212     1       2     168     1     1       3
##  2    53     2     1      140   203     2       1     155     2     3.1     1
##  3    70     2     1      145   174     1       2     125     2     2.6     1
##  4    61     2     1      148   203     1       2     161     1     0       3
##  5    58     2     1      114   318     1       3     140     1     4.4     1
##  6    55     2     1      160   289     1       1     145     2     0.8     2
##  7    46     2     1      120   249     1       1     144     1     0.8     3
##  8    54     2     1      122   286     1       1     116     2     3.2     2
##  9    51     2     1      140   298     1       2     122     2     4.2     2
## 10    52     2     1      128   204     2       2     156     2     1       2
## # … with 703 more rows, and 4 more variables: ca <dbl>, thal <dbl>,
## #   target <fct>, pvalues <dbl>
# converting the target column into a factor of 2 groups
Male_Data$target <- as.factor(Male_Data$target)
Female_Data$target <- as.factor(Female_Data$target)

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:gridExtra':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:mosaic':
## 
##     dotPlot
## 
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## 
## The following object is masked from 'package:purrr':
## 
##     lift
#Feature selection using random forest technique
Feature_Importance_Males = randomForest(target~., data=Male_Data)
# Create an importance based on mean decreasing gini
importance(Feature_Importance_Males)
##          MeanDecreaseGini
## age             11.924009
## sex              0.000000
## cp              12.288118
## trestbps         7.936674
## chol             6.631398
## fbs              1.247778
## restecg          2.722251
## thalach          8.177346
## exang           10.096695
## oldpeak         16.625298
## slope            7.774846
## ca              10.629276
## thal            17.299322
## pvalues          9.647001
varImp(Feature_Importance_Males)
##            Overall
## age      11.924009
## sex       0.000000
## cp       12.288118
## trestbps  7.936674
## chol      6.631398
## fbs       1.247778
## restecg   2.722251
## thalach   8.177346
## exang    10.096695
## oldpeak  16.625298
## slope     7.774846
## ca       10.629276
## thal     17.299322
## pvalues   9.647001
varImpPlot(Feature_Importance_Males, col= "red", pch= 20)

Feature_Importance_Females = randomForest(target~., data=Female_Data)
# Create an importance based on mean decreasing gini
importance(Feature_Importance_Females)
##          MeanDecreaseGini
## age             32.344729
## sex              0.000000
## cp              45.878216
## trestbps        24.185781
## chol            28.093235
## fbs              3.764119
## restecg          5.973994
## thalach         46.295532
## exang           10.672837
## oldpeak         37.299017
## slope           15.019152
## ca              45.620489
## thal            20.668602
## pvalues         27.953655
varImp(Feature_Importance_Females)
##            Overall
## age      32.344729
## sex       0.000000
## cp       45.878216
## trestbps 24.185781
## chol     28.093235
## fbs       3.764119
## restecg   5.973994
## thalach  46.295532
## exang    10.672837
## oldpeak  37.299017
## slope    15.019152
## ca       45.620489
## thal     20.668602
## pvalues  27.953655
varImpPlot(Feature_Importance_Females, col= "red", pch= 20)

# 4) Hypothesis

# H0 = There is no association between chest pain and heart disease diagnosis

# HA = There is association between chest pain and heart disease diagnosis

qqnorm(heart$age)
qqline(heart$age)

# Scatterplot
library(ggplot2)
gg <- ggplot(heart, aes(x=chol, y=trestbps)) + 
  geom_point(aes(col=target, size=oldpeak)) + 
  geom_smooth(method="loess", se=F) + 
  xlim(c(100, 430)) + 
  ylim(c(75, 200)) + 
  labs(subtitle="trestbps Vs chol", 
       y="trestbps", 
       x="chol", 
       title="Scatterplot", 
       caption = "Source: midwest", 
       bins = 30)
plot(gg)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).

# We will try logistic regression to predict which patients have heart disease.

# Logistic Regression Prediction


library(caTools)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(ROCR)
library(Information)
## 
## Attaching package: 'Information'
## 
## The following object is masked from 'package:dials':
## 
##     penalty
library(OptimalCutpoints)
library(pROC)
library(caret)
library(ggplot2)
library(lattice)

#Split the Data to training and testing data to conduct a logistic regression model
set.seed(123)
split=sample.split(heart$target, SplitRatio = 0.75)
Train_Data=subset(heart,split == TRUE)
Test_Data=subset(heart,split == FALSE)


# Fit a logistic regression model: We will fit a logistic regression model using the "glm" 
# function in R.

Log_model <- glm(target ~., data=Train_Data, family = "binomial")
summary(Log_model)
## 
## Call:
## glm(formula = target ~ ., family = "binomial", data = Train_Data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4807  -0.3794   0.1204   0.6027   2.7168  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  6.151504   1.921100   3.202 0.001364 ** 
## age         -0.007024   0.014762  -0.476 0.634226    
## sex         -1.770169   0.293189  -6.038 1.56e-09 ***
## cp           0.837972   0.115954   7.227 4.95e-13 ***
## trestbps    -0.020891   0.006786  -3.079 0.002078 ** 
## chol        -0.005026   0.002361  -2.129 0.033240 *  
## fbs         -0.432964   0.340185  -1.273 0.203113    
## restecg      0.451292   0.218007   2.070 0.038444 *  
## thalach      0.026793   0.006557   4.086 4.39e-05 ***
## exang       -1.025174   0.269484  -3.804 0.000142 ***
## oldpeak     -0.599619   0.137488  -4.361 1.29e-05 ***
## slope        0.398782   0.226414   1.761 0.078189 .  
## ca          -0.758476   0.122161  -6.209 5.34e-10 ***
## thal        -1.015182   0.186298  -5.449 5.06e-08 ***
## pvalues     -1.144755   1.780704  -0.643 0.520311    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1064.15  on 767  degrees of freedom
## Residual deviance:  546.33  on 753  degrees of freedom
## AIC: 576.33
## 
## Number of Fisher Scoring iterations: 6
# Model evaluation: We will evaluate the performance of the model using various metrics such as 
# accuracy, sensitivity, specificity, and ROC curve.


# Make predictions on test data
predictions <- predict(Log_model, newdata = Test_Data, type = "response")

# Measure the accuracy of prediction in the test data
# The common practice is to take the probability cutoff as 0.5.
# If the probability of Y is > 0.5, then it can be classified an event (presence of heart disease).
# So if pred is greater than 0.5, it is positive(heart disease =yes) else it is negative

y_pred_num <- ifelse(predictions > 0.5, 1, 0)
y_pred <- factor(y_pred_num, levels=c(0, 1))
y_act <- Test_Data$target

# Result : Prediction Accuracy (Proportion of predicted target that matches with actual target)
mean(y_pred == y_act)
## [1] 0.8677043
# 0.8754864

# Calculate the optimal threshold
pred <- prediction(predictions, Test_Data$target)
perf <- performance(pred, "tpr", "fpr")
thresholds <- perf@alpha.values[[1]]

# Plot the ROC curve
plot(perf)

# Add the optimal threshold point to the ROC curve
abline(a = 0, b = 1)
points(thresholds, thresholds, col = "red", pch = 19)

# Get p-values for all features
summary(Log_model)$coef[, "Pr(>|z|)"]
##  (Intercept)          age          sex           cp     trestbps         chol 
## 1.364422e-03 6.342264e-01 1.563796e-09 4.946596e-13 2.078266e-03 3.324016e-02 
##          fbs      restecg      thalach        exang      oldpeak        slope 
## 2.031131e-01 3.844444e-02 4.388141e-05 1.422598e-04 1.293311e-05 7.818903e-02 
##           ca         thal      pvalues 
## 5.338421e-10 5.058450e-08 5.203109e-01
# Suppose you have a vector of p-values in scientific notation
p_values <- c(601880e-03, 6.525117e-01, 1.709865e-09, 3.400166e-13, 2.303392e-03, 3.193722e-02, 
              2.370040e-01, 4.173028e-02, 4.954128e-05, 1.587542e-04, 1.448314e-05, 8.981884e-02, 
              4.013419e-10, 5.346086e-08 )

# Use format() to convert to normal form
formatted_p_values <- format(p_values, scientific = FALSE)

# View the result
formatted_p_values
##  [1] "601.8799999999999954525" "  0.6525117000000000278"
##  [3] "  0.0000000017098650000" "  0.0000000000003400166"
##  [5] "  0.0023033920000000000" "  0.0319372200000000023"
##  [7] "  0.2370039999999999925" "  0.0417302800000000015"
##  [9] "  0.0000495412800000000" "  0.0001587542000000000"
## [11] "  0.0000144831400000000" "  0.0898188399999999970"
## [13] "  0.0000000004013419000" "  0.0000000534608600000"
# The p-value obtained for the coefficient of the chest pain variable (cp) in the logistic 
# regression model indicates the statistical significance of the association between 
# chest pain and heart disease diagnosis. 
# The p-value for cp is given as "3.40e-13" which is very small (less than 0.05), 
# indicating strong evidence against the null hypothesis that there is no association 
# between chest pain and heart disease diagnosis.

# Therefore, based on the p-value obtained, we can reject the null hypothesis and conclude 
# that there is a significant association between chest pain and heart disease diagnosis.


predictTrain = predict(Log_model, type='response')
#Confusion matrix using threshold of 0.5
table(Train_Data$target, predictTrain>0.5)
##    
##     FALSE TRUE
##   0   300   74
##   1    35  359
#    FALSE TRUE
# 0   301   73
# 1    35  359


# The table shows the number of instances that fall into each combination of actual and predicted classes.
# In this case, there were 301 instances where the actual target was 0 and the model 
# predicted 0 (true negative), 73 instances where the actual target was 0 and the 
# model predicted 1 (false positive), 35 instances where the actual target was 1 and the 
# model predicted 0 (false negative), and 359 instances where the actual target was 1 and 
# the model predicted 1 (true positive).

# True Negative (TN): 301 - The number of instances where the model correctly predicted the 
# absence of heart disease (actual target = 0)
# False Positive (FP): 73 - The number of instances where the model predicted the presence 
# of heart disease (actual target = 0)
# False Negative (FN): 35 - The number of instances where the model predicted the absence 
# of heart disease (actual target = 1)
# True Positive (TP): 359 - The number of instances where the model correctly predicted the 
# presence of heart disease (actual target = 1)


#Calculate the accuracy on the training set
(301+359)/nrow(Train_Data)  # 0.859375
## [1] 0.859375
#Predictions on Test set
predictTest = predict(Log_model, newdata=Test_Data, type='response')
#Confusion matrix using threshold of 0.5
table(Test_Data$target, predictTest>0.5)
##    
##     FALSE TRUE
##   0   103   22
##   1    12  120
# FALSE TRUE
# 0   105   20
# 1    12  120


# Anova test
anova(Log_model, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: target
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                       767    1064.15              
## age       1   40.290       766    1023.86 2.189e-10 ***
## sex       1   80.482       765     943.38 < 2.2e-16 ***
## cp        1  142.059       764     801.32 < 2.2e-16 ***
## trestbps  1   17.529       763     783.79 2.830e-05 ***
## chol      1    5.270       762     778.52   0.02170 *  
## fbs       1    3.061       761     775.46   0.08019 .  
## restecg   1    2.469       760     772.99   0.11610    
## thalach   1   71.812       759     701.18 < 2.2e-16 ***
## exang     1   25.360       758     675.82 4.756e-07 ***
## oldpeak   1   50.334       757     625.49 1.297e-12 ***
## slope     1    0.343       756     625.14   0.55791    
## ca        1   48.249       755     576.90 3.755e-12 ***
## thal      1   30.164       754     546.73 3.969e-08 ***
## pvalues   1    0.400       753     546.33   0.52707    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA (Analysis of Variance) test is used to determine whether there are significant differences 
# between the means of two or more groups. In the context of logistic regression, an ANOVA test 
# is used to determine if there is a significant difference in the deviance between two or more 
# nested models.
# 
# In the output, we can see the ANOVA table for the logistic regression model Log_model. 
# The table shows the change in deviance for each variable added to the model, and the 
# p-value associated with that change in deviance. The null hypothesis is that the model 
# without the variable is not significantly different from the model with the variable. 
# The alternative hypothesis is that the model with the variable is significantly better 
# than the model without the variable.
# 
# In this case, we can see that all the variables are statistically significant except 
# for slope, restecg, and fbs (which has a p-value of 0.65). This suggests that the 
# model is a good fit for the data and that all the variables are important in predicting 
# the target variable.


# t-test
# t-test is statistical method used to determine the significant difference between the means of two groups.

# t-test to confirm the association between chest pain and heart disease

ttest_cp <- t.test(heart$cp ~ heart$target, var.equal= TRUE)  
ttest_cp
## 
##  Two Sample t-test
## 
## data:  heart$cp by heart$target
## t = -15.445, df = 1023, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -1.009114 -0.781608
## sample estimates:
## mean in group 0 mean in group 1 
##        1.482966        2.378327
# The output shows the test statistic t, which is -15.445, the degrees of freedom (df) which 
# is 1023, and the p-value which is less than 2.2e-16, indicating strong evidence against 
# the null hypothesis. The null hypothesis is that there is no significant difference in 
# the means of chest pain between individuals with and without heart disease. 
# The alternative hypothesis is that there is a significant difference in the means of 
# chest pain between the two groups.
# 
# The 95% confidence interval is also shown, which is -1.009114 to -0.781608. The sample 
# means for each group are given as well, which are 1.482966 for group 0 (no heart disease) 
# and 2.378327 for group 1 (heart disease).
# 
# Therefore, the results of the t-test indicate a significant difference in the means of 
# chest pain between individuals with and without heart disease. Individuals with heart 
# disease tend to have a higher mean chest pain score than those without heart disease.

# Chi-test
# let's perform chi-squared test of independence to investigate the association between two 
# categorical variables, chest pain (heart$cp) and heart disease diagnosis (heart$target).
CHI_cp <- chisq.test(heart$cp, heart$target) 

# Print the results to see if p<0.05.
print(CHI_cp)
## 
##  Pearson's Chi-squared test
## 
## data:  heart$cp and heart$target
## X-squared = 280.98, df = 3, p-value < 2.2e-16
# The p-value is less than 2.2e-16, which indicates strong evidence against the null hypothesis 
# of independence. Therefore, we can conclude that there is a significant association between 
# chest pain and heart disease diagnosis.



# Calculate the confusion matrix
conf_matrix <- confusionMatrix(y_pred, y_act)
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 103  12
##          1  22 120
##                                           
##                Accuracy : 0.8677          
##                  95% CI : (0.8201, 0.9066)
##     No Information Rate : 0.5136          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7346          
##                                           
##  Mcnemar's Test P-Value : 0.1227          
##                                           
##             Sensitivity : 0.8240          
##             Specificity : 0.9091          
##          Pos Pred Value : 0.8957          
##          Neg Pred Value : 0.8451          
##              Prevalence : 0.4864          
##          Detection Rate : 0.4008          
##    Detection Prevalence : 0.4475          
##       Balanced Accuracy : 0.8665          
##                                           
##        'Positive' Class : 0               
## 
# Calculate the accuracy
accuracy <- conf_matrix$overall[["Accuracy"]]
print(paste0("Accuracy: ", accuracy))
## [1] "Accuracy: 0.867704280155642"
# Compute metrics
sensitivity <- conf_matrix$byClass["Sensitivity"]
print(sensitivity) #  0.84 
## Sensitivity 
##       0.824
specificity <- conf_matrix$byClass["Specificity"]
print(specificity) # 0.9090909 
## Specificity 
##   0.9090909
# ROC curve
library(pROC)
roc_data <- roc(Test_Data$target, predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_data, col = "blue", print.auc = TRUE, auc.polygon = TRUE, max.auc.polygon = TRUE,
     grid=c(0.1,0.2), grid.col=c("green", "yellow"), lwd=2, main="ROC Curve")
legend("bottomright", legend = paste("AUC =", round(auc(roc_data),2)), col = "blue", lty=1, cex=1.5)

# The logistic regression model has a sensitivity of 0.84, which means that the model 
# correctly identifies 84% of the people who have heart disease. 

# The specificity of the model is 0.909, which means that the model correctly identifies 
# 90.9% of the people who do not have heart disease.

# The ROC curve plot shows the tradeoff between sensitivity and specificity for 
# different probability thresholds of the model. 
# The closer the curve is to the top left corner, the better the model performance. 
# The AUC (Area Under the Curve) value is a measure of the model's overall performance, 
# where an AUC of 1 represents a perfect model and an AUC of 0.5 represents a random classifier. 
# In this case, the AUC is 0.91, which indicates a good performance of the model.

# Overall, the logistic regression model seems to perform well in predicting the 
# presence of heart disease in the test data.




# 5) Hypothesis 5
# Is there a significant difference in the mean cholesterol levels between individuals 
# with and without heart disease?

# chol: The person’s cholesterol measurement in mg/dl

# Two-sample t-test to compare mean cholesterol levels between individuals with and without heart disease
ttest_chol <- t.test(heart$chol ~ heart$target, var.equal=TRUE)

# Print the results
print(ttest_chol)
## 
##  Two Sample t-test
## 
## data:  heart$chol by heart$target
## t = 3.2134, df = 1023, p-value = 0.001353
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##   4.015552 16.611444
## sample estimates:
## mean in group 0 mean in group 1 
##        251.2926        240.9791
# p-value = 0.001353

# Since this value is less than 0.05, we can reject the null hypothesis and conclude that there 
# is a significant difference in the mean cholesterol levels between individuals with and 
# without heart disease.


# Factor Analysis


library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(magrittr)
library(NbClust)
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(dplyr)
library(psych)
## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:randomForest':
## 
##     outlier
## 
## The following objects are masked from 'package:mosaic':
## 
##     logit, rescale
## 
## The following objects are masked from 'package:scales':
## 
##     alpha, rescale
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
heart_data <- heart
heart_data
## # A tibble: 1,025 × 15
##      age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##    <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl>
##  1    52     2     1      125   212     1       2     168     1     1       3
##  2    53     2     1      140   203     2       1     155     2     3.1     1
##  3    70     2     1      145   174     1       2     125     2     2.6     1
##  4    61     2     1      148   203     1       2     161     1     0       3
##  5    62     1     1      138   294     2       2     106     1     1.9     2
##  6    58     1     1      100   248     1       1     122     1     1       2
##  7    58     2     1      114   318     1       3     140     1     4.4     1
##  8    55     2     1      160   289     1       1     145     2     0.8     2
##  9    46     2     1      120   249     1       1     144     1     0.8     3
## 10    54     2     1      122   286     1       1     116     2     3.2     2
## # … with 1,015 more rows, and 4 more variables: ca <dbl>, thal <dbl>,
## #   target <fct>, pvalues <dbl>
colnames(heart_data)
##  [1] "age"      "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"     "target"   "pvalues"
str(heart_data)
## spc_tbl_ [1,025 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age     : num [1:1025] 52 53 70 61 62 58 58 55 46 54 ...
##  $ sex     : num [1:1025] 2 2 2 2 1 1 2 2 2 2 ...
##  $ cp      : num [1:1025] 1 1 1 1 1 1 1 1 1 1 ...
##  $ trestbps: num [1:1025] 125 140 145 148 138 100 114 160 120 122 ...
##  $ chol    : num [1:1025] 212 203 174 203 294 248 318 289 249 286 ...
##  $ fbs     : num [1:1025] 1 2 1 1 2 1 1 1 1 1 ...
##  $ restecg : num [1:1025] 2 1 2 2 2 1 3 1 1 1 ...
##  $ thalach : num [1:1025] 168 155 125 161 106 122 140 145 144 116 ...
##  $ exang   : num [1:1025] 1 2 2 1 1 1 1 2 1 2 ...
##  $ oldpeak : num [1:1025] 1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
##  $ slope   : num [1:1025] 3 1 1 3 2 2 1 2 3 2 ...
##  $ ca      : num [1:1025] 3 1 1 2 4 1 4 2 1 3 ...
##  $ thal    : num [1:1025] 3 3 3 3 2 2 1 3 3 2 ...
##  $ target  : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
##  $ pvalues : num [1:1025] 0.055856 0.000377 0.001197 0.024796 0.000302 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   sex = col_double(),
##   ..   cp = col_double(),
##   ..   trestbps = col_double(),
##   ..   chol = col_double(),
##   ..   fbs = col_double(),
##   ..   restecg = col_double(),
##   ..   thalach = col_double(),
##   ..   exang = col_double(),
##   ..   oldpeak = col_double(),
##   ..   slope = col_double(),
##   ..   ca = col_double(),
##   ..   thal = col_double(),
##   ..   target = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
attach(heart_data)
## The following objects are masked from heart:
## 
##     age, ca, chol, cp, exang, fbs, oldpeak, pvalues, restecg, sex,
##     slope, target, thal, thalach, trestbps
#heart_data[1:6]
heart_data
## # A tibble: 1,025 × 15
##      age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##    <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl>
##  1    52     2     1      125   212     1       2     168     1     1       3
##  2    53     2     1      140   203     2       1     155     2     3.1     1
##  3    70     2     1      145   174     1       2     125     2     2.6     1
##  4    61     2     1      148   203     1       2     161     1     0       3
##  5    62     1     1      138   294     2       2     106     1     1.9     2
##  6    58     1     1      100   248     1       1     122     1     1       2
##  7    58     2     1      114   318     1       3     140     1     4.4     1
##  8    55     2     1      160   289     1       1     145     2     0.8     2
##  9    46     2     1      120   249     1       1     144     1     0.8     3
## 10    54     2     1      122   286     1       1     116     2     3.2     2
## # … with 1,015 more rows, and 4 more variables: ca <dbl>, thal <dbl>,
## #   target <fct>, pvalues <dbl>
heart_data$target <- as.numeric(heart_data$target)

fit.pc <- principal(heart_data, nfactors=4, rotate="varimax")
fit.pc
## Principal Components Analysis
## Call: principal(r = heart_data, nfactors = 4, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
##            RC1   RC4   RC2   RC3   h2   u2 com
## age      -0.30  0.08  0.41  0.49 0.51 0.49 2.7
## sex       0.06  0.47  0.14 -0.65 0.66 0.34 2.0
## cp        0.24 -0.62  0.27 -0.21 0.56 0.44 2.0
## trestbps -0.15 -0.08  0.54  0.25 0.39 0.61 1.6
## chol      0.10  0.19  0.11  0.73 0.60 0.40 1.2
## fbs       0.01 -0.09  0.68 -0.13 0.49 0.51 1.1
## restecg  -0.08 -0.20 -0.38 -0.19 0.22 0.78 2.2
## thalach   0.67 -0.26 -0.02 -0.15 0.54 0.46 1.4
## exang    -0.46  0.50 -0.04  0.00 0.46 0.54 2.0
## oldpeak  -0.74  0.17  0.12 -0.01 0.59 0.41 1.2
## slope     0.82  0.03 -0.09  0.05 0.68 0.32 1.0
## ca       -0.07  0.45  0.43  0.06 0.39 0.61 2.1
## thal     -0.02  0.61  0.07 -0.05 0.38 0.62 1.0
## target    0.43 -0.70 -0.16  0.00 0.70 0.30 1.8
## pvalues   0.39 -0.12 -0.54  0.07 0.47 0.53 2.0
## 
##                        RC1  RC4  RC2  RC3
## SS loadings           2.41 2.11 1.71 1.40
## Proportion Var        0.16 0.14 0.11 0.09
## Cumulative Var        0.16 0.30 0.42 0.51
## Proportion Explained  0.32 0.28 0.22 0.18
## Cumulative Proportion 0.32 0.59 0.82 1.00
## 
## Mean item complexity =  1.7
## Test of the hypothesis that 4 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.09 
##  with the empirical chi square  1653.05  with prob <  1.1e-312 
## 
## Fit based upon off diagonal values = 0.81
round(fit.pc$values, 3)
##  [1] 3.539 1.628 1.261 1.209 1.004 0.967 0.892 0.775 0.746 0.677 0.632 0.498
## [13] 0.435 0.371 0.366
fit.pc$loadings
## 
## Loadings:
##          RC1    RC4    RC2    RC3   
## age      -0.302         0.414  0.491
## sex              0.469  0.139 -0.645
## cp        0.243 -0.621  0.272 -0.213
## trestbps -0.148         0.544  0.254
## chol             0.191  0.113  0.734
## fbs                     0.681 -0.134
## restecg         -0.195 -0.379 -0.191
## thalach   0.671 -0.261        -0.148
## exang    -0.461  0.495              
## oldpeak  -0.738  0.166  0.125       
## slope     0.819                     
## ca               0.447  0.429       
## thal             0.609              
## target    0.429 -0.697 -0.164       
## pvalues   0.394 -0.125 -0.540       
## 
##                  RC1   RC4   RC2   RC3
## SS loadings    2.415 2.114 1.712 1.397
## Proportion Var 0.161 0.141 0.114 0.093
## Cumulative Var 0.161 0.302 0.416 0.509
# Loadings with more digits
for (i in c(1,3,2,4)) { print(fit.pc$loadings[[1,i]])}
## [1] -0.301812
## [1] 0.4141713
## [1] 0.07793931
## [1] 0.4912733
# Communalities
fit.pc$communality
##       age       sex        cp  trestbps      chol       fbs   restecg   thalach 
## 0.5100523 0.6583869 0.5641369 0.3878055 0.5973798 0.4891392 0.2248950 0.5400533 
##     exang   oldpeak     slope        ca      thal    target   pvalues 
## 0.4592026 0.5884998 0.6814477 0.3940923 0.3780577 0.6967032 0.4678972
# Rotated factor scores, Notice the columns ordering: RC1, RC3, RC2 and RC4
# fit.pc$scores
head(fit.pc$scores)
##             RC1         RC4        RC2        RC3
## [1,]  0.9211371  1.43470866 -0.2316238 -0.6701889
## [2,] -1.5292302  0.40231673  1.1074114 -1.3275474
## [3,] -2.4316832  0.25523494 -0.2985576 -0.6532033
## [4,]  0.8354094  1.06924634  0.2405381 -0.2802802
## [5,] -1.0186815 -0.08943952  1.3159179  1.2129655
## [6,] -0.7248101 -0.72738032 -1.0491437  1.0132839
# Play with FA utilities
fa.parallel(heart_data) # See factor recommendation

## Parallel analysis suggests that the number of factors =  6  and the number of components =  4
fa.plot(fit.pc) # See Correlations within Factors

fa.diagram(fit.pc) # Visualize the relationship

vss(heart_data) # See Factor recommendations for a simple structure

## 
## Very Simple Structure
## Call: vss(x = heart_data)
## VSS complexity 1 achieves a maximimum of 0.51  with  1  factors
## VSS complexity 2 achieves a maximimum of 0.64  with  8  factors
## 
## The Velicer MAP achieves a minimum of 0.02  with  1  factors 
## BIC achieves a minimum of  -99.28  with  6  factors
## Sample Size adjusted BIC achieves a minimum of  -26.21  with  7  factors
## 
## Statistics by number of factors 
##   vss1 vss2   map dof chisq     prob sqresid  fit RMSEA BIC SABIC complex
## 1 0.51 0.00 0.018  90  1110 2.3e-175    11.7 0.51 0.105 486   772     1.0
## 2 0.48 0.59 0.021  76   786 1.4e-118     9.6 0.59 0.095 260   501     1.3
## 3 0.43 0.60 0.028  63   493  5.0e-68     8.4 0.64 0.082  57   257     1.5
## 4 0.44 0.63 0.034  51   322  5.9e-41     7.1 0.70 0.072 -32   130     1.6
## 5 0.43 0.62 0.042  40   198  8.2e-23     6.4 0.73 0.062 -79    48     1.7
## 6 0.41 0.61 0.057  30   109  7.5e-11     5.8 0.76 0.051 -99    -4     1.9
## 7 0.46 0.63 0.075  21    53  1.5e-04     4.9 0.79 0.038 -93   -26     1.8
## 8 0.49 0.64 0.101  13    31  3.1e-03     4.3 0.82 0.037 -59   -18     1.9
##   eChisq  SRMR eCRMS eBIC
## 1   1506 0.084 0.090  882
## 2    848 0.063 0.074  321
## 3    541 0.050 0.065  104
## 4    290 0.037 0.053  -64
## 5    172 0.028 0.046 -105
## 6     81 0.019 0.036 -127
## 7     38 0.013 0.030 -107
## 8     17 0.009 0.026  -73
# Computing Correlation Matrix
corrm.emp <- cor(heart_data)
corrm.emp
##                  age         sex          cp    trestbps        chol
## age       1.00000000 -0.10324030 -0.07196627  0.27112141  0.21982253
## sex      -0.10324030  1.00000000 -0.04111909 -0.07897377 -0.19825787
## cp       -0.07196627 -0.04111909  1.00000000  0.03817742 -0.08164102
## trestbps  0.27112141 -0.07897377  0.03817742  1.00000000  0.12797743
## chol      0.21982253 -0.19825787 -0.08164102  0.12797743  1.00000000
## fbs       0.12124348  0.02720046  0.07929359  0.18176662  0.02691716
## restecg  -0.13269617 -0.05511721  0.04358061 -0.12379409 -0.14741024
## thalach  -0.39022708 -0.04936524  0.30683928 -0.03926407 -0.02177209
## exang     0.08816338  0.13915681 -0.40151271  0.06119697  0.06738223
## oldpeak   0.20813668  0.08468656 -0.17473348  0.18743411  0.06488031
## slope    -0.16910511 -0.02666629  0.13163278 -0.12044531 -0.01424787
## ca        0.27155053  0.11172891 -0.17620647  0.10455372  0.07425934
## thal      0.07224456  0.20211721 -0.16985403  0.05894881  0.09552541
## target   -0.22932355 -0.27950076  0.43485425 -0.13877173 -0.09996559
## pvalues  -0.24580364 -0.06777006  0.06640377 -0.19108407 -0.03900630
##                   fbs     restecg      thalach       exang     oldpeak
## age       0.121243479 -0.13269617 -0.390227075  0.08816338  0.20813668
## sex       0.027200461 -0.05511721 -0.049365243  0.13915681  0.08468656
## cp        0.079293586  0.04358061  0.306839282 -0.40151271 -0.17473348
## trestbps  0.181766624 -0.12379409 -0.039264069  0.06119697  0.18743411
## chol      0.026917164 -0.14741024 -0.021772091  0.06738223  0.06488031
## fbs       1.000000000 -0.10405124 -0.008865857  0.04926057  0.01085948
## restecg  -0.104051244  1.00000000  0.048410637 -0.06560553 -0.05011425
## thalach  -0.008865857  0.04841064  1.000000000 -0.38028087 -0.34979616
## exang     0.049260570 -0.06560553 -0.380280872  1.00000000  0.31084376
## oldpeak   0.010859481 -0.05011425 -0.349796163  0.31084376  1.00000000
## slope    -0.061902374  0.08608609  0.395307843 -0.26733547 -0.57518854
## ca        0.137156259 -0.07807235 -0.207888416  0.10784854  0.22181603
## thal     -0.030129129 -0.02030400 -0.106701873  0.20958207  0.20473483
## target   -0.041163547  0.13446821  0.422895496 -0.43802855 -0.43844127
## pvalues  -0.243712291  0.11754954  0.267218689 -0.27484412 -0.26075671
##                slope          ca        thal      target     pvalues
## age      -0.16910511  0.27155053  0.07224456 -0.22932355 -0.24580364
## sex      -0.02666629  0.11172891  0.20211721 -0.27950076 -0.06777006
## cp        0.13163278 -0.17620647 -0.16985403  0.43485425  0.06640377
## trestbps -0.12044531  0.10455372  0.05894881 -0.13877173 -0.19108407
## chol     -0.01424787  0.07425934  0.09552541 -0.09996559 -0.03900630
## fbs      -0.06190237  0.13715626 -0.03012913 -0.04116355 -0.24371229
## restecg   0.08608609 -0.07807235 -0.02030400  0.13446821  0.11754954
## thalach   0.39530784 -0.20788842 -0.10670187  0.42289550  0.26721869
## exang    -0.26733547  0.10784854  0.20958207 -0.43802855 -0.27484412
## oldpeak  -0.57518854  0.22181603  0.20473483 -0.43844127 -0.26075671
## slope     1.00000000 -0.07344041 -0.09650143  0.34551175  0.27668649
## ca       -0.07344041  1.00000000  0.14576170 -0.38208529 -0.26031799
## thal     -0.09650143  0.14576170  1.00000000 -0.35128336 -0.11977944
## target    0.34551175 -0.38208529 -0.35128336  1.00000000  0.27596524
## pvalues   0.27668649 -0.26031799 -0.11977944  0.27596524  1.00000000
plot(corrm.emp)

heart_data_pca <- prcomp(heart_data, scale=TRUE)
summary(heart_data_pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5    PC6     PC7
## Standard deviation     1.8812 1.2761 1.12290 1.09975 1.00187 0.9836 0.94434
## Proportion of Variance 0.2359 0.1086 0.08406 0.08063 0.06692 0.0645 0.05945
## Cumulative Proportion  0.2359 0.3445 0.42855 0.50918 0.57610 0.6406 0.70005
##                            PC8     PC9    PC10    PC11    PC12    PC13   PC14
## Standard deviation     0.88042 0.86359 0.82302 0.79486 0.70584 0.65941 0.6087
## Proportion of Variance 0.05168 0.04972 0.04516 0.04212 0.03321 0.02899 0.0247
## Cumulative Proportion  0.75172 0.80144 0.84660 0.88872 0.92193 0.95092 0.9756
##                           PC15
## Standard deviation     0.60465
## Proportion of Variance 0.02437
## Cumulative Proportion  1.00000
plot(heart_data_pca)

# A table containing eigenvalues and %'s accounted, follows. Eigenvalues are the sdev^2
(eigen_heart_data <- round(heart_data_pca$sdev^2,3))
##  [1] 3.539 1.628 1.261 1.209 1.004 0.967 0.892 0.775 0.746 0.677 0.632 0.498
## [13] 0.435 0.371 0.366
round(fit.pc$values, 3)
##  [1] 3.539 1.628 1.261 1.209 1.004 0.967 0.892 0.775 0.746 0.677 0.632 0.498
## [13] 0.435 0.371 0.366
names(eigen_heart_data) <- paste("PC",1:14,sep="")
eigen_heart_data
##   PC1   PC2   PC3   PC4   PC5   PC6   PC7   PC8   PC9  PC10  PC11  PC12  PC13 
## 3.539 1.628 1.261 1.209 1.004 0.967 0.892 0.775 0.746 0.677 0.632 0.498 0.435 
##  PC14  <NA> 
## 0.371 0.366
sumlambdas <- sum(eigen_heart_data)
sumlambdas
## [1] 15
propvar <- round(eigen_heart_data/sumlambdas,2)
propvar
##  PC1  PC2  PC3  PC4  PC5  PC6  PC7  PC8  PC9 PC10 PC11 PC12 PC13 PC14 <NA> 
## 0.24 0.11 0.08 0.08 0.07 0.06 0.06 0.05 0.05 0.05 0.04 0.03 0.03 0.02 0.02
cumvar_heart_data <- cumsum(propvar)
cumvar_heart_data
##  PC1  PC2  PC3  PC4  PC5  PC6  PC7  PC8  PC9 PC10 PC11 PC12 PC13 PC14 <NA> 
## 0.24 0.35 0.43 0.51 0.58 0.64 0.70 0.75 0.80 0.85 0.89 0.92 0.95 0.97 0.99
matlambdas <- rbind(eigen_heart_data,propvar,cumvar_heart_data)
matlambdas
##                     PC1   PC2   PC3   PC4   PC5   PC6   PC7   PC8   PC9  PC10
## eigen_heart_data  3.539 1.628 1.261 1.209 1.004 0.967 0.892 0.775 0.746 0.677
## propvar           0.240 0.110 0.080 0.080 0.070 0.060 0.060 0.050 0.050 0.050
## cumvar_heart_data 0.240 0.350 0.430 0.510 0.580 0.640 0.700 0.750 0.800 0.850
##                    PC11  PC12  PC13  PC14  <NA>
## eigen_heart_data  0.632 0.498 0.435 0.371 0.366
## propvar           0.040 0.030 0.030 0.020 0.020
## cumvar_heart_data 0.890 0.920 0.950 0.970 0.990
rownames(matlambdas) <- c("Eigenvalues","Prop. variance","Cum. prop. variance")
rownames(matlambdas)
## [1] "Eigenvalues"         "Prop. variance"      "Cum. prop. variance"
eigvec.emp <- heart_data_pca$rotation
print(heart_data_pca)
## Standard deviations (1, .., p=15):
##  [1] 1.8812235 1.2760824 1.1229031 1.0997498 1.0018657 0.9835860 0.9443355
##  [8] 0.8804198 0.8635916 0.8230152 0.7948601 0.7058415 0.6594143 0.6087487
## [15] 0.6046523
## 
## Rotation (n x k) = (15 x 15):
##                  PC1          PC2         PC3          PC4          PC5
## age       0.24984321 -0.393171325  0.15361807 -0.079606227  0.265670994
## sex       0.10553952  0.361464951 -0.56292407 -0.074125500 -0.114973076
## cp       -0.25290181 -0.297681988 -0.27233425  0.287501040 -0.214844981
## trestbps  0.14937255 -0.428285805 -0.08966072  0.003320797 -0.287626472
## chol      0.09471237 -0.303739885  0.36580424 -0.451616085 -0.252621540
## fbs       0.08323174 -0.352032612 -0.45349568  0.053831609  0.164408972
## restecg  -0.11304826  0.221331435  0.12767202  0.256130125  0.342746434
## thalach  -0.35120347 -0.052339607 -0.21840651 -0.179412840 -0.317999665
## exang     0.32415364  0.220331295  0.07806227 -0.022361829  0.005896473
## oldpeak   0.35607178  0.026082571  0.08488007  0.327357009 -0.289582165
## slope    -0.31595929  0.003041486 -0.12301238 -0.505501618  0.267645055
## ca        0.25157744 -0.097916696 -0.22954447 -0.269823895  0.408595108
## thal      0.20652409  0.204043741 -0.13527341 -0.335630911 -0.368814119
## target   -0.41282284 -0.179816304  0.06652019  0.170951351  0.019739549
## pvalues  -0.28299732  0.204971521  0.26456660 -0.151597786 -0.137753273
##                  PC6         PC7         PC8          PC9        PC10
## age      -0.21099100  0.26821531 -0.15155247 -0.408110421  0.19417752
## sex      -0.01573767  0.23701769 -0.08262481 -0.168878000  0.26639344
## cp       -0.25659179  0.17820935  0.22296981 -0.279683792 -0.02361997
## trestbps -0.18893939 -0.21656930 -0.71243542  0.154434372 -0.05839473
## chol      0.00765153 -0.13595235  0.43397344 -0.001779595  0.16942700
## fbs       0.26728749 -0.40754202  0.19535830  0.076039033  0.53037556
## restecg  -0.59801390 -0.52674985  0.04044943 -0.005495562  0.15025462
## thalach  -0.06800240 -0.18133486  0.07396642  0.357952185 -0.17614283
## exang     0.34264321 -0.38645630 -0.11687031 -0.140201440 -0.09372719
## oldpeak  -0.19547803  0.07698159  0.13307982  0.295776684  0.05430526
## slope    -0.03970721 -0.09647195 -0.20922289 -0.130127018 -0.11533282
## ca       -0.30228492  0.22576669  0.17314099  0.485363761 -0.16917415
## thal     -0.40051798 -0.20120688  0.12910253 -0.363589928 -0.04537439
## target    0.04533678 -0.04349071  0.06854687 -0.151837013 -0.09522989
## pvalues  -0.09440683  0.20093115 -0.22985466  0.232388921  0.67747496
##                 PC11         PC12         PC13         PC14         PC15
## age      -0.01932611 -0.047437714  0.537390771  0.119500844 -0.181936765
## sex       0.49687698 -0.200422930  0.081132788 -0.247441644 -0.070742161
## cp        0.19803390  0.501342315 -0.212736537  0.287046735  0.011677345
## trestbps  0.12382502 -0.037320096 -0.223082003 -0.135805308 -0.033488347
## chol      0.46644446 -0.104602836 -0.148672821 -0.104503840  0.008007358
## fbs      -0.24642535 -0.007115338  0.014804407 -0.005539555  0.098767188
## restecg   0.24899625 -0.110129987 -0.002956789  0.039985565 -0.061566344
## thalach   0.03474022 -0.078479112  0.575846005  0.251362884 -0.297692793
## exang     0.22884343  0.627432700  0.192151175  0.005228984 -0.218876077
## oldpeak   0.03231800  0.134505256  0.388800475 -0.148989761  0.574044141
## slope     0.10427223  0.182241283  0.121999014  0.039393013  0.642222782
## ca       -0.01372112  0.307189037 -0.116035804 -0.211876051 -0.212706280
## thal     -0.52003280  0.053121250 -0.053353679 -0.137868206 -0.042544817
## target   -0.04629167  0.116613648  0.188933508 -0.812507195 -0.104068366
## pvalues  -0.14012863  0.346124807 -0.026412846 -0.008674171 -0.103143000
# Taking the first four PCs to generate linear combinations for all the variables with four factors
pcafactors.emp <- eigvec.emp[,1:4]
pcafactors.emp
##                  PC1          PC2         PC3          PC4
## age       0.24984321 -0.393171325  0.15361807 -0.079606227
## sex       0.10553952  0.361464951 -0.56292407 -0.074125500
## cp       -0.25290181 -0.297681988 -0.27233425  0.287501040
## trestbps  0.14937255 -0.428285805 -0.08966072  0.003320797
## chol      0.09471237 -0.303739885  0.36580424 -0.451616085
## fbs       0.08323174 -0.352032612 -0.45349568  0.053831609
## restecg  -0.11304826  0.221331435  0.12767202  0.256130125
## thalach  -0.35120347 -0.052339607 -0.21840651 -0.179412840
## exang     0.32415364  0.220331295  0.07806227 -0.022361829
## oldpeak   0.35607178  0.026082571  0.08488007  0.327357009
## slope    -0.31595929  0.003041486 -0.12301238 -0.505501618
## ca        0.25157744 -0.097916696 -0.22954447 -0.269823895
## thal      0.20652409  0.204043741 -0.13527341 -0.335630911
## target   -0.41282284 -0.179816304  0.06652019  0.170951351
## pvalues  -0.28299732  0.204971521  0.26456660 -0.151597786
# Multiplying each column of the eigenvector’s matrix by the square-root of the corresponding eigenvalue in order to get the factor loadings
unrot.fact.emp <- sweep(pcafactors.emp,MARGIN=2,heart_data_pca$sdev[1:4],`*`)
unrot.fact.emp
##                 PC1          PC2         PC3          PC4
## age       0.4700109 -0.501719027  0.17249821 -0.087546934
## sex       0.1985434  0.461259080 -0.63210918 -0.081519506
## cp       -0.4757648 -0.379866760 -0.30580497  0.316179218
## trestbps  0.2810032 -0.546527999 -0.10068030  0.003652046
## chol      0.1781751 -0.387597137  0.41076271 -0.496664710
## fbs       0.1565775 -0.449222637 -0.50923170  0.059201302
## restecg  -0.2126691  0.282437160  0.14336330  0.281679060
## thalach  -0.6606922 -0.066789653 -0.24524935 -0.197309240
## exang     0.6098054  0.281160899  0.08765636 -0.024592418
## oldpeak   0.6698506  0.033283511  0.09531209  0.360010813
## slope    -0.5943900  0.003881187 -0.13813098 -0.555925315
## ca        0.4732734 -0.124949777 -0.25775619 -0.296738781
## thal      0.3885180  0.260376637 -0.15189893 -0.369110036
## target   -0.7766120 -0.229460429  0.07469573  0.188003718
## pvalues  -0.5323812  0.261560560  0.29708266 -0.166719639
# Computing communalities
communalities.emp <- rowSums(unrot.fact.emp^2)
communalities.emp
##       age       sex        cp  trestbps      chol       fbs   restecg   thalach 
## 0.5100523 0.6583869 0.5641369 0.3878055 0.5973798 0.4891392 0.2248950 0.5400533 
##     exang   oldpeak     slope        ca      thal    target   pvalues 
## 0.4592026 0.5884998 0.6814477 0.3940923 0.3780577 0.6967032 0.4678972
# Performing the varimax rotation. The default in the varimax function is norm=TRUE thus, Kaiser normalization is carried out
rot.fact.emp <- varimax(unrot.fact.emp)

#View(unrot.fact.emp)
rot.fact.emp
## $loadings
## 
## Loadings:
##          PC1    PC2    PC3    PC4   
## age       0.302 -0.414  0.491       
## sex             -0.139 -0.645 -0.469
## cp       -0.243 -0.272 -0.213  0.621
## trestbps  0.148 -0.544  0.254       
## chol            -0.113  0.734 -0.191
## fbs             -0.681 -0.134       
## restecg          0.379 -0.191  0.195
## thalach  -0.671        -0.148  0.261
## exang     0.461               -0.495
## oldpeak   0.738 -0.125        -0.166
## slope    -0.819                     
## ca              -0.429        -0.447
## thal                          -0.609
## target   -0.429  0.164         0.697
## pvalues  -0.394  0.540         0.125
## 
##                  PC1   PC2   PC3   PC4
## SS loadings    2.415 1.712 1.397 2.114
## Proportion Var 0.161 0.114 0.093 0.141
## Cumulative Var 0.161 0.275 0.368 0.509
## 
## $rotmat
##            [,1]       [,2]       [,3]       [,4]
## [1,] 0.71782434 -0.3439057  0.1320701 -0.5907745
## [2,] 0.05293757  0.7043005 -0.5337905 -0.4650012
## [3,] 0.27192129  0.6176082  0.7258728  0.1331452
## [4,] 0.63873676  0.0651898 -0.4131996  0.6457799
# The print method of varimax omits loadings less than abs(0.1). In order to display all the loadings, it is necessary to ask explicitly the contents of the object $loadings
fact.load.emp <- rot.fact.emp$loadings[1:14,1:4]
fact.load.emp
##                  PC1         PC2          PC3         PC4
## age       0.30181198 -0.41417126  0.491273341 -0.07793931
## sex      -0.05701621 -0.13912528 -0.645141082 -0.46858635
## cp       -0.24282439 -0.27217812 -0.212685649  0.62117397
## trestbps  0.14773462 -0.54350145  0.254253460  0.07707997
## chol     -0.09816287 -0.11294735  0.733810412 -0.19107326
## fbs      -0.01204250 -0.68088200 -0.133629387  0.08681632
## restecg   0.08119482  0.37896370 -0.191175659  0.19529667
## thalach  -0.67051383  0.01584529 -0.148097699  0.26130531
## exang     0.46074471  0.04084030  0.004244965 -0.49520795
## oldpeak   0.73846654 -0.12458929 -0.008871091 -0.16602945
## slope    -0.81911287  0.08559617  0.048869743 -0.02805113
## ca        0.07348527 -0.42930031  0.064716409 -0.44744303
## thal      0.01560266 -0.06810639 -0.045418152 -0.60919043
## target   -0.42922186  0.16386084 -0.003546873  0.69685639
# Computing the rotated factor scores for the 30 European Countries. Notice that signs are reversed for factors F2 (PC2), F3 (PC3) and F4 (PC4)
scale.emp <- scale(heart_data[1:14])
# scale.emp
head(scale.emp)
##             age        sex         cp   trestbps        chol        fbs
## [1,] -0.2683056  0.6611813 -0.9153086 -0.3774513 -0.65901038 -0.4186735
## [2,] -0.1580799  0.6611813 -0.9153086  0.4788735 -0.83345431  2.3861656
## [3,]  1.7157579  0.6611813 -0.9153086  0.7643151 -1.39555140 -0.4186735
## [4,]  0.7237261  0.6611813 -0.9153086  0.9355801 -0.83345431 -0.4186735
## [5,]  0.8339519 -1.5109689 -0.9153086  0.3646969  0.93036760  2.3861656
## [6,]  0.3930489 -1.5109689 -0.9153086 -1.8046593  0.03876532 -0.4186735
##        restecg    thalach      exang     oldpeak      slope         ca
## [1,]  0.890820  0.8209198 -0.7119396 -0.06085868  0.9949476  1.2086307
## [2,] -1.003559  0.2558430  1.4032432  1.72629436 -2.2425804 -0.7316143
## [3,]  0.890820 -1.0481803  1.4032432  1.30078173 -2.2425804 -0.7316143
## [4,]  0.890820  0.5166477 -0.7119396 -0.91188394  0.9949476  0.2385082
## [5,]  0.890820 -1.8740617 -0.7119396  0.70506405 -0.6238164  2.1787531
## [6,] -1.003559 -1.1785826 -0.7119396 -0.06085868 -0.6238164 -0.7316143
##            thal     target
## [1,]  1.1150813 -1.0261968
## [2,]  1.1150813 -1.0261968
## [3,]  1.1150813 -1.0261968
## [4,]  1.1150813 -1.0261968
## [5,] -0.5510387 -1.0261968
## [6,] -0.5510387  0.9735213
head(as.matrix(scale.emp)%*%fact.load.emp%*%solve(t(fact.load.emp)%*%fact.load.emp))
##             PC1        PC2        PC3        PC4
## [1,] -0.9349143  0.2652563 -0.6565332 -1.4395593
## [2,]  1.6446757 -1.3892331 -1.4419745 -0.3616709
## [3,]  2.4746866  0.1935791 -0.6958274 -0.2400944
## [4,] -0.8848376 -0.1198755 -0.2312880 -1.0866489
## [5,]  1.0805632 -1.4669814  1.1516297  0.1112267
## [6,]  0.5979801  1.3587567  1.1389951  0.6827263
# Different way of PCA Method


library(factoextra)
library(FactoMineR)
library(ggfortify)
## Registered S3 method overwritten by 'ggfortify':
##   method          from   
##   autoplot.glmnet parsnip
library(psych)
library(corrplot)
library(devtools)
## Loading required package: usethis
## 
## Attaching package: 'devtools'
## 
## The following object is masked from 'package:recipes':
## 
##     check
res.pca <- PCA(heart[,1:13], graph = FALSE)
print(res.pca)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 1025 individuals, described by 13 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"
# Visualize and Interpret PCA using these functions 

#get_eigenvalue(res.pca): Extract the eigenvalues/variances of principal components
#fviz_eig(res.pca): Visualize the eigenvalues
#get_pca_ind(res.pca), get_pca_var(res.pca): Extract the results for individuals and variables, respectively.
#fviz_pca_ind(res.pca), fviz_pca_var(res.pca): Visualize the results individuals and variables, respectively.
#fviz_pca_biplot(res.pca): Make a biplot of individuals and variables.

eig.val <- get_eigenvalue(res.pca)
eig.val
##        eigenvalue variance.percent cumulative.variance.percent
## Dim.1   2.7830596        21.408151                    21.40815
## Dim.2   1.5585672        11.988978                    33.39713
## Dim.3   1.2029097         9.253152                    42.65028
## Dim.4   1.1669979         8.976907                    51.62719
## Dim.5   0.9959175         7.660904                    59.28809
## Dim.6   0.9625088         7.403914                    66.69201
## Dim.7   0.8783455         6.756504                    73.44851
## Dim.8   0.7679664         5.907433                    79.35594
## Dim.9   0.7292387         5.609529                    84.96547
## Dim.10  0.6315583         4.858141                    89.82361
## Dim.11  0.5222838         4.017568                    93.84118
## Dim.12  0.4316169         3.320130                    97.16131
## Dim.13  0.3690297         2.838690                   100.00000
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 50))

var <- get_pca_var(res.pca)
#var$coord: coordinates of variables to create a scatter plot
#var$cos2: represents the quality of representation for variables on the factor map. It’s calculated as the squared coordinates: var.cos2 = var.coord * var.coord.
#var$contrib: contains the contributions (in percentage) of the variables to the principal components. 
#The contribution of a variable (var) to a given principal component is (in percentage) : (var.cos2 * 100) / (total cos2 of the component).
var
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"
# Coordinates
head(var$coord)
##               Dim.1      Dim.2       Dim.3        Dim.4       Dim.5
## age       0.5143487  0.4990651 -0.07634702  0.059195259 -0.32810646
## sex       0.1319762 -0.4752962  0.67194202 -0.003787684  0.04762913
## cp       -0.4770016  0.3416973  0.21289872 -0.447663818  0.13043284
## trestbps  0.2968919  0.5471188  0.17124649 -0.144468670  0.21565878
## chol      0.2118213  0.4622542 -0.26740797  0.505462697  0.30811307
## fbs       0.1360315  0.3960127  0.49837684 -0.178310347 -0.16173188
# Cos2: quality on the factore map
head(var$cos2)
##               Dim.1     Dim.2       Dim.3        Dim.4       Dim.5
## age      0.26455456 0.2490659 0.005828868 3.504079e-03 0.107653851
## sex      0.01741773 0.2259065 0.451506072 1.434655e-05 0.002268534
## cp       0.22753051 0.1167570 0.045325866 2.004029e-01 0.017012726
## trestbps 0.08814481 0.2993390 0.029325360 2.087120e-02 0.046508710
## chol     0.04486824 0.2136789 0.071507020 2.554925e-01 0.094933662
## fbs      0.01850458 0.1568261 0.248379474 3.179458e-02 0.026157199
# Contributions to the principal components
head(var$contrib)
##              Dim.1     Dim.2      Dim.3        Dim.4      Dim.5
## age      9.5058889 15.980443  0.4845641  0.300264349 10.8095146
## sex      0.6258482 14.494497 37.5344940  0.001229356  0.2277834
## cp       8.1755530  7.491305  3.7680190 17.172515022  1.7082465
## trestbps 3.1671910 19.206040  2.4378688  1.788451900  4.6699359
## chol     1.6121913 13.709958  5.9445044 21.893144168  9.5322814
## fbs      0.6649006 10.062196 20.6482226  2.724476118  2.6264423
#The plot Below is also known as variable correlation plots. It shows the relationships between all variables. It can be interpreted as follow:

#Positively correlated variables are grouped together.
#Negatively correlated variables are positioned on opposite sides of the plot origin (opposed quadrants).
#The distance between variables and the origin measures the quality of the variables on the factor map. 
#Variables that are away from the origin are well represented on the factor map.

# Correlation circle
fviz_pca_var(res.pca, col.var = "black")

# Quality of representation


corrplot(var$cos2, is.corr=FALSE)

# Total cos2 of variables on Dim.1 and Dim.2
#A high cos2 indicates a good representation of the variable on the principal component. 
#In this case the variable is positioned close to the circumference of the correlation circle.
#A low cos2 indicates that the variable is not perfectly represented by the PCs. 
#In this case the variable is close to the center of the circle.

fviz_cos2(res.pca, choice = "var", axes = 1:2)

fviz_pca_var(res.pca, col.var = "cos2",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
             repel = TRUE # Avoid text overlapping
)

# Change the transparency by cos2 values
fviz_pca_var(res.pca, alpha.var = "cos2")

corrplot(var$contrib, is.corr=FALSE)

# Contributions of variables to PC1
fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)

# Contributions of variables to PC2
fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)

fviz_pca_var(res.pca, col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

fviz_pca_var(res.pca, alpha.var = "contrib")

fviz_pca_ind(res.pca,
             geom.ind = "point", # show points only (nbut not "text")
             col.ind = heart$target, # color by groups
             palette = c("#00AFBB", "#E7B800", "#FC4E07"),
             addEllipses = TRUE, # Concentration ellipses
             legend.title = "Groups"
)

# Description of PC

res.desc <- dimdesc(res.pca, axes = c(1,2,3,4,5), proba = 0.05)
# Description of dimension 1
res.desc$Dim.1
## 
## Link between the variable and the continuous variables (R-square)
## =================================================================================
##          correlation       p.value
## oldpeak    0.7023260 3.521895e-153
## exang      0.6083286 8.959399e-105
## age        0.5143487  2.660905e-70
## ca         0.4410908  4.856320e-50
## thal       0.3665971  5.881322e-34
## trestbps   0.2968919  2.643038e-22
## chol       0.2118213  7.323225e-12
## fbs        0.1360315  1.241619e-05
## sex        0.1319762  2.248189e-05
## restecg   -0.2151083  3.401365e-12
## cp        -0.4770016  2.334808e-59
## slope     -0.6327058 1.004959e-115
## thalach   -0.6951050 8.537505e-149
res.desc$Dim.2
## 
## Link between the variable and the continuous variables (R-square)
## =================================================================================
##          correlation      p.value
## trestbps  0.54711885 4.316495e-81
## age       0.49906506 1.168945e-65
## chol      0.46225416 2.137136e-55
## fbs       0.39601272 8.008825e-40
## cp        0.34169727 1.906009e-29
## ca        0.13356227 1.785957e-05
## thalach   0.11386589 2.592693e-04
## slope     0.07643698 1.437421e-02
## oldpeak  -0.08404940 7.094501e-03
## thal     -0.23683979 1.555433e-14
## restecg  -0.30497917 1.666111e-23
## exang    -0.32276479 2.793006e-26
## sex      -0.47529619 6.860180e-59
res.desc$Dim.3
## 
## Link between the variable and the continuous variables (R-square)
## =================================================================================
##          correlation       p.value
## sex       0.67194202 1.430340e-135
## fbs       0.49837684  1.867951e-65
## ca        0.34634137  2.946136e-30
## thal      0.28697180  6.967823e-21
## thalach   0.21802204  1.705632e-12
## cp        0.21289872  5.703323e-12
## trestbps  0.17124649  3.454787e-08
## slope     0.16016684  2.537697e-07
## age      -0.07634702  1.448987e-02
## restecg  -0.26380120  8.847331e-18
## chol     -0.26740797  3.041939e-18
res.desc$Dim.4
## 
## Link between the variable and the continuous variables (R-square)
## =================================================================================
##          correlation      p.value
## slope     0.52474189 1.349658e-73
## chol      0.50546270 1.422337e-67
## thal      0.35745956 2.962931e-32
## ca        0.22956613 1.004412e-13
## exang     0.15192611 1.026625e-06
## thalach   0.09501493 2.325708e-03
## trestbps -0.14446867 3.417390e-06
## fbs      -0.17831035 9.037288e-09
## restecg  -0.19530646 2.862441e-10
## oldpeak  -0.35893217 1.588975e-32
## cp       -0.44766382 1.155834e-51
res.desc$Dim.5
## 
## Link between the variable and the continuous variables (R-square)
## =================================================================================
##          correlation      p.value
## thal      0.32695877 5.805262e-27
## thalach   0.31981038 8.322597e-26
## chol      0.30811307 5.575289e-24
## trestbps  0.21565878 2.987761e-12
## oldpeak   0.21522758 3.307229e-12
## cp        0.13043284 2.805582e-05
## exang     0.09010505 3.887815e-03
## fbs      -0.16173188 1.930194e-07
## slope    -0.23206595 5.328331e-14
## age      -0.32810646 3.760519e-27
## restecg  -0.38207963 5.714017e-37
## ca       -0.48777132 2.240638e-62
# Graph of Indiviuals
ind <- get_pca_ind(res.pca)
ind
## Principal Component Analysis Results for individuals
##  ===================================================
##   Name       Description                       
## 1 "$coord"   "Coordinates for the individuals" 
## 2 "$cos2"    "Cos2 for the individuals"        
## 3 "$contrib" "contributions of the individuals"
## Principal Component Analysis Results for individuals
##  ===================================================
##   Name       Description                       
## 1 "$coord"   "Coordinates for the individuals" 
## 2 "$cos2"    "Cos2 for the individuals"        
## 3 "$contrib" "contributions of the individuals"
#To get access to the different components, use this:

# Coordinates of individuals
head(ind$coord)
##        Dim.1      Dim.2      Dim.3       Dim.4      Dim.5
## 1 -0.5088278 -1.1281932  0.9623453  1.11441460 -0.8296152
## 2  2.6039019 -0.5451647  1.4705356 -1.52841708  1.6337808
## 3  3.0525119 -1.3222916 -0.4463478 -1.57977920  0.1247189
## 4 -0.4794296 -0.2946210  0.8184706  0.96132306 -0.7327194
## 5  2.1727510  1.9677780 -0.3847231 -0.25977074 -2.4420092
## 6  0.1703675 -0.1136465 -2.0357571  0.09181311 -0.3800811
# Quality of individuals
head(ind$cos2)
##         Dim.1       Dim.2       Dim.3        Dim.4       Dim.5
## 1 0.033283820 0.163628337 0.119056592 0.1596559580 0.088479973
## 2 0.326485488 0.014311003 0.104127567 0.1124859801 0.128529324
## 3 0.482643036 0.090566128 0.010319486 0.1292718899 0.000805705
## 4 0.027891023 0.010532759 0.081286959 0.1121381539 0.065146339
## 5 0.221856574 0.181972015 0.006955829 0.0031712673 0.280250896
## 6 0.002674068 0.001189902 0.381813258 0.0007766198 0.013309186
# Contributions of individuals
head(ind$contrib)
##         Dim.1        Dim.2      Dim.3        Dim.4       Dim.5
## 1 0.009076016 0.0796741757 0.07511125 0.1038244485 0.067422708
## 2 0.237685600 0.0186039875 0.17538571 0.1952944105 0.261481127
## 3 0.326639234 0.1094473104 0.01615808 0.2086406040 0.001523763
## 4 0.008057557 0.0054334800 0.05433120 0.0772582331 0.052593018
## 5 0.165490673 0.2423833660 0.01200438 0.0056413949 0.584180915
## 6 0.001017483 0.0008084676 0.33612053 0.0007047181 0.014151589
fviz_pca_ind(res.pca)

fviz_pca_ind(res.pca, col.ind = "cos2", 
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE # Avoid text overlapping (slow if many points)
)

fviz_pca_ind(res.pca, pointsize = "cos2", 
             pointshape = 21, fill = "#E7B800",
             repel = TRUE # Avoid text overlapping (slow if many points)
)

fviz_pca_ind(res.pca, col.ind = "cos2", pointsize = "cos2",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE # Avoid text overlapping (slow if many points)
)

fviz_cos2(res.pca, choice = "ind")

# Total contribution on PC1 and PC2
fviz_contrib(res.pca, choice = "ind", axes = 1:2)

# Create a random continuous variable of length 23,
# Same length as the number of active individuals in the PCA
set.seed(123)
my.cont.var <- rnorm(1025)
# Color individuals by the continuous variable
fviz_pca_ind(res.pca, col.ind = my.cont.var,
             gradient.cols = c("blue", "yellow", "red"),
             legend.title = "Cont.Var")

heart$target <- as.factor(heart$target)
fviz_pca_ind(res.pca,
             geom.ind = "point", # show points only (nbut not "text")
             col.ind = heart$target, # color by groups
             palette = c("#00AFBB", "#E7B800", "#FC4E07"),
             addEllipses = TRUE, # Concentration ellipses
             legend.title = "Groups"
)

fviz_pca_ind(res.pca, geom.ind = "point", col.ind = heart$target, 
             palette = c("#00AFBB", "#E7B800", "#FC4E07"),
             addEllipses = TRUE, ellipse.type = "confidence",
             legend.title = "Groups"
)

fviz_pca_ind(res.pca,
             label = "none", # hide individual labels
             habillage = heart$target, # color by groups
             addEllipses = TRUE, # Concentration ellipses
             palette = "jco"
)

fviz_pca_var(res.pca, geom.var = c("point", "text"))

# Show individuals text labels only
fviz_pca_ind(res.pca, geom.ind =  "text")

# Change the size of arrows an labels
fviz_pca_var(res.pca, arrowsize = 1, labelsize = 5, 
             repel = TRUE)

# Change points size, shape and fill color
# Change labelsize
fviz_pca_ind(res.pca, 
             pointsize = 3, pointshape = 21, fill = "lightblue",
             labelsize = 5, repel = TRUE)

fviz_pca_ind(res.pca,
             geom.ind = "point", # show points only (but not "text")
             group.ind = heart$target, # color by groups
             legend.title = "Groups",
             mean.point = FALSE)

fviz_pca_ind(res.pca,
             geom.ind = "point", # show points only (but not "text")
             group.ind = heart$target, # color by groups
             legend.title = "Groups",
             mean.point = TRUE)

fviz_pca_var(res.pca, axes.linetype = "blank")

ind.p <- fviz_pca_ind(res.pca, geom = "point", col.ind = heart$target)
ggpubr::ggpar(ind.p,
              title = "Principal Component Analysis",
              subtitle = "Heart disease data",
              caption = "Source: factoextra",
              xlab = "PC1", ylab = "PC2",
              legend.title = "Disease or No Disease", legend.position = "top",
              ggtheme = theme_gray(), palette = "jco"
)

fviz_pca_biplot(res.pca, repel = TRUE,col.ind = heart$target,
                col.var = "#2E9FDF", # Variables color
)

fviz_pca_biplot(res.pca, 
                col.ind = heart$target, palette = "jco", 
                addEllipses = TRUE, label = "var",
                col.var = "black", repel = TRUE,
                legend.title = "Target") 

fviz_pca_biplot(res.pca, 
                # Fill individuals by groups
                geom.ind = "point",
                pointshape = 21,
                pointsize = 2.5,
                fill.ind = heart$target,
                col.ind = "black",
                # Color variable by groups
                legend.title = list(fill = "Target", color = "Clusters"),
                repel = TRUE        # Avoid label overplotting
)+
  ggpubr::fill_palette("jco")+      # Indiviual fill color
  ggpubr::color_palette("npg")      # Variable colors

fviz_pca_biplot(res.pca, 
                # Individuals
                geom.ind = "point",
                fill.ind = heart$target, col.ind = "black",
                pointshape = 21, pointsize = 2,
                palette = "jco",
                addEllipses = TRUE,
                # Variables
                alpha.var ="contrib", col.var = "contrib",
                gradient.cols = "RdYlBu",
                
                legend.title = list(fill = "Survivorship", color = "Contrib",
                                    alpha = "Contrib")
)

# Conclusion 

# From the above observations, we got the significant features which is correct:
# "sex", "cp", "restecg", "exang", "slope", "ca", "thal"  


# 1- Males are more vulnerable to be diagnosed with heart disease than females.
# 
# 2- Chest Pain is most common factor that leads to heart disease for males and females.
# 
# 3- Maximum heart rate achieved is the highest cause factor to cause heart disease for females where is Thalassemia is the highest to cause heart disease for males.
# 
# 4- There is a high association between chest pain and heart disease diagnosis.

# Limitation
# The dataset is missing some useful information such as smoking, obesity or family history 
# that can help in predicting.


# The following conditions are associated with increased prevalence of heart disease:

# Asymptomatic angina chest pain (relative to typical angina chest pain, atypical angina pain, or non-angina pain)
# Presence of exercise induced angina
# Lower fasting blood sugar
# Flat or down-sloaping peak exercise ST segment
# Presence of left ventricle hypertrophy
# Male
# Higher thelassemia score
# Higher age
# Lower max heart rate achieved
# Higher resting blood pressure
# Higher cholesterol
# Higher ST depression induced by exercise relative to rest